pabsw
__m128i _mm_abs_epi16 (__m128i a)

Synopsis

__m128i _mm_abs_epi16 (__m128i a)
#include "tmmintrin.h"
Instruction: pabsw xmm, xmm
CPUID Flags: SSSE3

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := ABS(a[i+15:i]) ENDFOR

Performance

Architecture    Latency    Throughput
Haswell         1          0.5
Ivy Bridge      1          0.5
Sandy Bridge    1          0.5
Westmere        1          0.5
Nehalem         1          0.5
vpabsw
__m128i _mm_mask_abs_epi16 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_abs_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpabsw
__m128i _mm_maskz_abs_epi16 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_abs_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpabsw
__m256i _mm256_abs_epi16 (__m256i a)

Synopsis

__m256i _mm256_abs_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpabsw ymm, ymm
CPUID Flags: AVX2

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := ABS(a[i+15:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture    Latency    Throughput
Haswell         1          -
vpabsw
__m256i _mm256_mask_abs_epi16 (__m256i src, __mmask16 k, __m256i a)

Synopsis

__m256i _mm256_mask_abs_epi16 (__m256i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpabsw
__m256i _mm256_maskz_abs_epi16 (__mmask16 k, __m256i a)

Synopsis

__m256i _mm256_maskz_abs_epi16 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpabsw
__m512i _mm512_abs_epi16 (__m512i a)

Synopsis

__m512i _mm512_abs_epi16 (__m512i a)
#include "immintrin.h"
Instruction: vpabsw
CPUID Flags: AVX512BW

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 31 i := j*16 dst[i+15:i] := ABS(a[i+15:i]) ENDFOR dst[MAX:512] := 0
vpabsw
__m512i _mm512_mask_abs_epi16 (__m512i src, __mmask32 k, __m512i a)

Synopsis

__m512i _mm512_mask_abs_epi16 (__m512i src, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsw
CPUID Flags: AVX512BW

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpabsw
__m512i _mm512_maskz_abs_epi16 (__mmask32 k, __m512i a)

Synopsis

__m512i _mm512_maskz_abs_epi16 (__mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsw
CPUID Flags: AVX512BW

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
pabsd
__m128i _mm_abs_epi32 (__m128i a)

Synopsis

__m128i _mm_abs_epi32 (__m128i a)
#include "tmmintrin.h"
Instruction: pabsd xmm, xmm
CPUID Flags: SSSE3

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ABS(a[i+31:i]) ENDFOR

Performance

Architecture    Latency    Throughput
Haswell         1          0.5
Ivy Bridge      1          0.5
Sandy Bridge    1          0.5
Westmere        1          0.5
Nehalem         1          0.5
vpabsd
__m128i _mm_mask_abs_epi32 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_abs_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ABS(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpabsd
__m128i _mm_maskz_abs_epi32 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_abs_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ABS(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpabsd
__m256i _mm256_abs_epi32 (__m256i a)

Synopsis

__m256i _mm256_abs_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vpabsd ymm, ymm
CPUID Flags: AVX2

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ABS(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture    Latency    Throughput
Haswell         1          -
vpabsd
__m256i _mm256_mask_abs_epi32 (__m256i src, __mmask8 k, __m256i a)

Synopsis

__m256i _mm256_mask_abs_epi32 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ABS(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpabsd
__m256i _mm256_maskz_abs_epi32 (__mmask8 k, __m256i a)

Synopsis

__m256i _mm256_maskz_abs_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ABS(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpabsd
__m512i _mm512_abs_epi32 (__m512i a)

Synopsis

__m512i _mm512_abs_epi32 (__m512i a)
#include "immintrin.h"
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ABS(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vpabsd
__m512i _mm512_mask_abs_epi32 (__m512i src, __mmask16 k, __m512i a)

Synopsis

__m512i _mm512_mask_abs_epi32 (__m512i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ABS(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpabsd
__m512i _mm512_maskz_abs_epi32 (__mmask16 k, __m512i a)

Synopsis

__m512i _mm512_maskz_abs_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ABS(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpabsq
__m128i _mm_abs_epi64 (__m128i a)

Synopsis

__m128i _mm_abs_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ABS(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vpabsq
__m128i _mm_mask_abs_epi64 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_abs_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ABS(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpabsq
__m128i _mm_maskz_abs_epi64 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_abs_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ABS(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpabsq
__m256i _mm256_abs_epi64 (__m256i a)

Synopsis

__m256i _mm256_abs_epi64 (__m256i a)
#include "immintrin.h"
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ABS(a[i+63:i]) ENDFOR dst[MAX:256] := 0
vpabsq
__m256i _mm256_mask_abs_epi64 (__m256i src, __mmask8 k, __m256i a)

Synopsis

__m256i _mm256_mask_abs_epi64 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ABS(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpabsq
__m256i _mm256_maskz_abs_epi64 (__mmask8 k, __m256i a)

Synopsis

__m256i _mm256_maskz_abs_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ABS(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpabsq
__m512i _mm512_abs_epi64 (__m512i a)

Synopsis

__m512i _mm512_abs_epi64 (__m512i a)
#include "immintrin.h"
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ABS(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vpabsq
__m512i _mm512_mask_abs_epi64 (__m512i src, __mmask8 k, __m512i a)

Synopsis

__m512i _mm512_mask_abs_epi64 (__m512i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ABS(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpabsq
__m512i _mm512_maskz_abs_epi64 (__mmask8 k, __m512i a)

Synopsis

__m512i _mm512_maskz_abs_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ABS(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
pabsb
__m128i _mm_abs_epi8 (__m128i a)

Synopsis

__m128i _mm_abs_epi8 (__m128i a)
#include "tmmintrin.h"
Instruction: pabsb xmm, xmm
CPUID Flags: SSSE3

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 15 i := j*8 dst[i+7:i] := ABS(a[i+7:i]) ENDFOR

Performance

Architecture    Latency    Throughput
Haswell         1          0.5
Ivy Bridge      1          0.5
Sandy Bridge    1          0.5
Westmere        1          0.5
Nehalem         1          0.5
vpabsb
__m128i _mm_mask_abs_epi8 (__m128i src, __mmask16 k, __m128i a)

Synopsis

__m128i _mm_mask_abs_epi8 (__m128i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpabsb
__m128i _mm_maskz_abs_epi8 (__mmask16 k, __m128i a)

Synopsis

__m128i _mm_maskz_abs_epi8 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpabsb
__m256i _mm256_abs_epi8 (__m256i a)

Synopsis

__m256i _mm256_abs_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpabsb ymm, ymm
CPUID Flags: AVX2

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := ABS(a[i+7:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture    Latency    Throughput
Haswell         1          -
vpabsb
__m256i _mm256_mask_abs_epi8 (__m256i src, __mmask32 k, __m256i a)

Synopsis

__m256i _mm256_mask_abs_epi8 (__m256i src, __mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpabsb
__m256i _mm256_maskz_abs_epi8 (__mmask32 k, __m256i a)

Synopsis

__m256i _mm256_maskz_abs_epi8 (__mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpabsb
__m512i _mm512_abs_epi8 (__m512i a)

Synopsis

__m512i _mm512_abs_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpabsb
CPUID Flags: AVX512BW

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 63 i := j*8 dst[i+7:i] := ABS(a[i+7:i]) ENDFOR dst[MAX:512] := 0
vpabsb
__m512i _mm512_mask_abs_epi8 (__m512i src, __mmask64 k, __m512i a)

Synopsis

__m512i _mm512_mask_abs_epi8 (__m512i src, __mmask64 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsb
CPUID Flags: AVX512BW

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpabsb
__m512i _mm512_maskz_abs_epi8 (__mmask64 k, __m512i a)

Synopsis

__m512i _mm512_maskz_abs_epi8 (__mmask64 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsb
CPUID Flags: AVX512BW

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpandq
__m512d _mm512_abs_pd (__m512d v2)

Synopsis

__m512d _mm512_abs_pd (__m512d v2)
#include "immintrin.h"
Instruction: vpandq zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ABS(v2[i+63:i]) ENDFOR dst[MAX:512] := 0
vpandq
__m512d _mm512_mask_abs_pd (__m512d src, __mmask8 k, __m512d v2)

Synopsis

__m512d _mm512_mask_abs_pd (__m512d src, __mmask8 k, __m512d v2)
#include "immintrin.h"
Instruction: vpandq zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ABS(v2[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
pabsw
__m64 _mm_abs_pi16 (__m64 a)

Synopsis

__m64 _mm_abs_pi16 (__m64 a)
#include "tmmintrin.h"
Instruction: pabsw mm, mm
CPUID Flags: SSSE3

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 3 i := j*16 dst[i+15:i] := ABS(a[i+15:i]) ENDFOR

Performance

Architecture    Latency    Throughput
Haswell         1          -
Ivy Bridge      1          -
Sandy Bridge    1          -
Westmere        1          -
Nehalem         1          -
pabsd
__m64 _mm_abs_pi32 (__m64 a)

Synopsis

__m64 _mm_abs_pi32 (__m64 a)
#include "tmmintrin.h"
Instruction: pabsd mm, mm
CPUID Flags: SSSE3

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 1 i := j*32 dst[i+31:i] := ABS(a[i+31:i]) ENDFOR

Performance

Architecture    Latency    Throughput
Haswell         1          -
Ivy Bridge      1          -
Sandy Bridge    1          -
Westmere        1          -
Nehalem         1          -
pabsb
__m64 _mm_abs_pi8 (__m64 a)

Synopsis

__m64 _mm_abs_pi8 (__m64 a)
#include "tmmintrin.h"
Instruction: pabsb mm, mm
CPUID Flags: SSSE3

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 7 i := j*8 dst[i+7:i] := ABS(a[i+7:i]) ENDFOR

Performance

Architecture    Latency    Throughput
Haswell         1          -
Ivy Bridge      1          -
Sandy Bridge    1          -
Westmere        1          -
Nehalem         1          -
vpandd
__m512 _mm512_abs_ps (__m512 v2)

Synopsis

__m512 _mm512_abs_ps (__m512 v2)
#include "immintrin.h"
Instruction: vpandd zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ABS(v2[i+31:i]) ENDFOR dst[MAX:512] := 0
vpandd
__m512 _mm512_mask_abs_ps (__m512 src, __mmask16 k, __m512 v2)

Synopsis

__m512 _mm512_mask_abs_ps (__m512 src, __mmask16 k, __m512 v2)
#include "immintrin.h"
Instruction: vpandd zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ABS(v2[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_acos_pd (__m128d a)

Synopsis

__m128d _mm_acos_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ACOS(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_acos_pd (__m256d a)

Synopsis

__m256d _mm256_acos_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ACOS(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_acos_pd (__m512d a)

Synopsis

__m512d _mm512_acos_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ACOS(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_acos_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_acos_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ACOS(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_acos_ps (__m128 a)

Synopsis

__m128 _mm_acos_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ACOS(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_acos_ps (__m256 a)

Synopsis

__m256 _mm256_acos_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ACOS(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_acos_ps (__m512 a)

Synopsis

__m512 _mm512_acos_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ACOS(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_acos_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_acos_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ACOS(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_acosh_pd (__m128d a)

Synopsis

__m128d _mm_acosh_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ACOSH(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_acosh_pd (__m256d a)

Synopsis

__m256d _mm256_acosh_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ACOSH(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_acosh_pd (__m512d a)

Synopsis

__m512d _mm512_acosh_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ACOSH(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_acosh_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_acosh_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ACOSH(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_acosh_ps (__m128 a)

Synopsis

__m128 _mm_acosh_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ACOSH(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_acosh_ps (__m256 a)

Synopsis

__m256 _mm256_acosh_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ACOSH(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_acosh_ps (__m512 a)

Synopsis

__m512 _mm512_acosh_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ACOSH(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_acosh_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_acosh_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ACOSH(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpadcd
__m512i _mm512_adc_epi32 (__m512i v2, __mmask16 k2, __m512i v3, __mmask16 * k2_res)

Synopsis

__m512i _mm512_adc_epi32 (__m512i v2, __mmask16 k2, __m512i v3, __mmask16 * k2_res)
#include "immintrin.h"
Instruction: vpadcd zmm {k}, k, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition of packed 32-bit integers in v2 and v3 and the corresponding bit in k2, storing the result of the addition in dst and the result of the carry in k2_res.

Operation

FOR j := 0 to 15 i := j*32 k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i] + k2[j]) dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j] ENDFOR dst[MAX:512] := 0
vpadcd
__m512i _mm512_mask_adc_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * k2_res)

Synopsis

__m512i _mm512_mask_adc_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * k2_res)
#include "immintrin.h"
Instruction: vpadcd zmm {k}, k, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition of packed 32-bit integers in v2 and v3 and the corresponding bit in k2, storing the result of the addition in dst and the result of the carry in k2_res using writemask k1 (elements are copied from v2 when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i] + k2[j]) dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j] ELSE dst[i+31:i] := v2[i+31:i] FI ENDFOR dst[MAX:512] := 0
paddw
__m128i _mm_add_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_add_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddw xmm, xmm
CPUID Flags: SSE2

Description

Add packed 16-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := a[i+15:i] + b[i+15:i] ENDFOR

Performance

Architecture    Latency    Throughput
Haswell         1          0.5
Ivy Bridge      1          0.5
Sandy Bridge    1          0.5
Westmere        1          0.5
Nehalem         1          0.5
vpaddw
__m128i _mm_mask_add_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_add_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpaddw
__m128i _mm_maskz_add_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_add_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpaddw
__m256i _mm256_add_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_add_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 16-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := a[i+15:i] + b[i+15:i] ENDFOR dst[MAX:256] := 0

Performance

Architecture    Latency    Throughput
Haswell         1          0.5
vpaddw
__m256i _mm256_mask_add_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_add_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpaddw
__m256i _mm256_maskz_add_epi16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_add_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpaddw
__m512i _mm512_add_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_add_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddw
CPUID Flags: AVX512BW

Description

Add packed 16-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 dst[i+15:i] := a[i+15:i] + b[i+15:i] ENDFOR dst[MAX:512] := 0
vpaddw
__m512i _mm512_mask_add_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_add_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddw
CPUID Flags: AVX512BW

Description

Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpaddw
__m512i _mm512_maskz_add_epi16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_add_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddw
CPUID Flags: AVX512BW

Description

Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
paddd
__m128i _mm_add_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_add_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddd xmm, xmm
CPUID Flags: SSE2

Description

Add packed 32-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := a[i+31:i] + b[i+31:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpaddd
__m128i _mm_mask_add_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_add_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpaddd
__m128i _mm_maskz_add_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_add_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpaddd
__m256i _mm256_add_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_add_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 32-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := a[i+31:i] + b[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.5
vpaddd
__m256i _mm256_mask_add_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_add_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpaddd
__m256i _mm256_maskz_add_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_add_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpaddd
__m512i _mm512_add_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_add_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed 32-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i] + b[i+31:i] ENDFOR dst[MAX:512] := 0
vpaddd
__m512i _mm512_mask_add_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_add_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpaddd
__m512i _mm512_maskz_add_epi32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_add_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
paddq
__m128i _mm_add_epi64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_add_epi64 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddq xmm, xmm
CPUID Flags: SSE2

Description

Add packed 64-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := a[i+63:i] + b[i+63:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpaddq
__m128i _mm_mask_add_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_add_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpaddq
__m128i _mm_maskz_add_epi64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_add_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpaddq
__m256i _mm256_add_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_add_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 64-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[i+63:i] + b[i+63:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.5
vpaddq
__m256i _mm256_mask_add_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_add_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpaddq
__m256i _mm256_maskz_add_epi64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_add_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpaddq
__m512i _mm512_add_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_add_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Add packed 64-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+63:i] + b[i+63:i] ENDFOR dst[MAX:512] := 0
vpaddq
__m512i _mm512_mask_add_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_add_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpaddq
__m512i _mm512_maskz_add_epi64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_add_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
paddb
__m128i _mm_add_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_add_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddb xmm, xmm
CPUID Flags: SSE2

Description

Add packed 8-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 dst[i+7:i] := a[i+7:i] + b[i+7:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpaddb
__m128i _mm_mask_add_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_add_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpaddb
__m128i _mm_maskz_add_epi8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_add_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpaddb
__m256i _mm256_add_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_add_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 8-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := a[i+7:i] + b[i+7:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.5
vpaddb
__m256i _mm256_mask_add_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_add_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpaddb
__m256i _mm256_maskz_add_epi8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_add_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpaddb
__m512i _mm512_add_epi8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_add_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddb
CPUID Flags: AVX512BW

Description

Add packed 8-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 63 i := j*8 dst[i+7:i] := a[i+7:i] + b[i+7:i] ENDFOR dst[MAX:512] := 0
vpaddb
__m512i _mm512_mask_add_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_add_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddb
CPUID Flags: AVX512BW

Description

Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpaddb
__m512i _mm512_maskz_add_epi8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_add_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddb
CPUID Flags: AVX512BW

Description

Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
addpd
__m128d _mm_add_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_add_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: addpd xmm, xmm
CPUID Flags: SSE2

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := a[i+63:i] + b[i+63:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell30.8
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vaddpd
__m128d _mm_mask_add_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_add_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vaddpd
__m128d _mm_maskz_add_pd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_add_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vaddpd
__m256d _mm256_add_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_add_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vaddpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[i+63:i] + b[i+63:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
vaddpd
__m256d _mm256_mask_add_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_mask_add_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vaddpd
__m256d _mm256_maskz_add_pd (__mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_maskz_add_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vaddpd
__m512d _mm512_add_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_add_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vaddpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+63:i] + b[i+63:i] ENDFOR dst[MAX:512] := 0
vaddpd
__m512d _mm512_mask_add_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_add_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vaddpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vaddpd
__m512d _mm512_maskz_add_pd (__mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_maskz_add_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vaddpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
addps
__m128 _mm_add_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_add_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: addps xmm, xmm
CPUID Flags: SSE

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := a[i+31:i] + b[i+31:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vaddps
__m128 _mm_mask_add_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_add_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vaddps
__m128 _mm_maskz_add_ps (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_add_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vaddps
__m256 _mm256_add_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_add_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vaddps ymm, ymm, ymm
CPUID Flags: AVX

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := a[i+31:i] + b[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
vaddps
__m256 _mm256_mask_add_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_mask_add_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vaddps
__m256 _mm256_maskz_add_ps (__mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_maskz_add_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vaddps
__m512 _mm512_add_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_add_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vaddps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i] + b[i+31:i] ENDFOR dst[MAX:512] := 0
vaddps
__m512 _mm512_mask_add_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_add_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vaddps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vaddps
__m512 _mm512_maskz_add_ps (__mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_maskz_add_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vaddps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vaddpd
__m512d _mm512_add_round_pd (__m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_add_round_pd (__m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vaddpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+63:i] + b[i+63:i] ENDFOR dst[MAX:512] := 0
vaddpd
__m512d _mm512_mask_add_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_mask_add_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vaddpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vaddpd
__m512d _mm512_maskz_add_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_maskz_add_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vaddpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vaddps
__m512 _mm512_add_round_ps (__m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_add_round_ps (__m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vaddps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i] + b[i+31:i] ENDFOR dst[MAX:512] := 0
vaddps
__m512 _mm512_mask_add_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_mask_add_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vaddps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vaddps
__m512 _mm512_maskz_add_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_maskz_add_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vaddps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vaddsd
__m128d _mm_add_round_sd (__m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_add_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vaddsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := a[63:0] + b[63:0] dst[127:64] := a[127:64] dst[MAX:128] := 0
vaddsd
__m128d _mm_mask_add_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_mask_add_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vaddsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := a[63:0] + b[63:0] ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vaddsd
__m128d _mm_maskz_add_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_maskz_add_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vaddsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := a[63:0] + b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vaddss
__m128 _mm_add_round_ss (__m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_add_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vaddss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := a[31:0] + b[31:0] dst[127:32] := a[127:32] dst[MAX:128] := 0
vaddss
__m128 _mm_mask_add_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_mask_add_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vaddss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := a[31:0] + b[31:0] ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vaddss
__m128 _mm_maskz_add_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_maskz_add_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vaddss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := a[31:0] + b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
addsd
__m128d _mm_add_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_add_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: addsd xmm, xmm
CPUID Flags: SSE2

Description

Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := a[63:0] + b[63:0] dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell30.8
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vaddsd
__m128d _mm_mask_add_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_add_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vaddsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := a[63:0] + b[63:0] ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vaddsd
__m128d _mm_maskz_add_sd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_add_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vaddsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := a[63:0] + b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
paddq
__m64 _mm_add_si64 (__m64 a, __m64 b)

Synopsis

__m64 _mm_add_si64 (__m64 a, __m64 b)
#include "emmintrin.h"
Instruction: paddq mm, mm
CPUID Flags: SSE2

Description

Add 64-bit integers a and b, and store the result in dst.

Operation

dst[63:0] := a[63:0] + b[63:0]

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge1-
Westmere1-
Nehalem1-
addss
__m128 _mm_add_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_add_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: addss xmm, xmm
CPUID Flags: SSE

Description

Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := a[31:0] + b[31:0] dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vaddss
__m128 _mm_mask_add_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_add_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vaddss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := a[31:0] + b[31:0] ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vaddss
__m128 _mm_maskz_add_ss (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_add_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vaddss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := a[31:0] + b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
adc
unsigned char _addcarry_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)

Synopsis

unsigned char _addcarry_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)
#include "immintrin.h"
Instruction: adc r32, r32

Description

Add unsigned 32-bit integers a and b with unsigned 8-bit carry-in c_in (carry flag), and store the unsigned 32-bit result in out, and the carry-out in dst (carry flag).

Operation

dst:out[31:0] := a[31:0] + b[31:0] + c_in;
adc
unsigned char _addcarry_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)

Synopsis

unsigned char _addcarry_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)
#include "immintrin.h"
Instruction: adc r64, r64

Description

Add unsigned 64-bit integers a and b with unsigned 8-bit carry-in c_in (carry flag), and store the unsigned 64-bit result in out, and the carry-out in dst (carry flag).

Operation

dst:out[63:0] := a[63:0] + b[63:0] + c_in;
adcx, adox
unsigned char _addcarryx_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)

Synopsis

unsigned char _addcarryx_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)
#include "immintrin.h"
Instruction: adcx r32, r32
             adox r32, r32
CPUID Flags: ADX

Description

Add unsigned 32-bit integers a and b with unsigned 8-bit carry-in c_in (carry or overflow flag), and store the unsigned 32-bit result in out, and the carry-out in dst (carry or overflow flag).

Operation

dst:out[31:0] := a[31:0] + b[31:0] + c_in;
adcx, adox
unsigned char _addcarryx_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)

Synopsis

unsigned char _addcarryx_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)
#include "immintrin.h"
Instruction: adcx r64, r64
             adox r64, r64
CPUID Flags: ADX

Description

Add unsigned 64-bit integers a and b with unsigned 8-bit carry-in c_in (carry or overflow flag), and store the unsigned 64-bit result in out, and the carry-out in dst (carry or overflow flag).

Operation

dst:out[63:0] := a[63:0] + b[63:0] + c_in;
vaddnpd
__m512d _mm512_addn_pd (__m512d v2, __m512d v3)

Synopsis

__m512d _mm512_addn_pd (__m512d v2, __m512d v3)
#include "immintrin.h"
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i]) ENDFOR dst[MAX:512] := 0
vaddnpd
__m512d _mm512_mask_addn_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3)

Synopsis

__m512d _mm512_mask_addn_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3)
#include "immintrin.h"
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vaddnps
__m512 _mm512_addn_ps (__m512 v2, __m512 v3)

Synopsis

__m512 _mm512_addn_ps (__m512 v2, __m512 v3)
#include "immintrin.h"
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i]) ENDFOR dst[MAX:512] := 0
vaddnps
__m512 _mm512_mask_addn_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3)

Synopsis

__m512 _mm512_mask_addn_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3)
#include "immintrin.h"
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vaddnpd
__m512d _mm512_addn_round_pd (__m512d v2, __m512d v3, int rounding)

Synopsis

__m512d _mm512_addn_round_pd (__m512d v2, __m512d v3, int rounding)
#include "immintrin.h"
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element by element addition between packed double-precision (64-bit) floating-point elements in v2 and v3 and negates the sum, storing the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i]) ENDFOR dst[MAX:512] := 0
vaddnpd
__m512d _mm512_mask_addn_round_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3, int rounding)

Synopsis

__m512d _mm512_mask_addn_round_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3, int rounding)
#include "immintrin.h"
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element by element addition between packed double-precision (64-bit) floating-point elements in v2 and v3 and negates the sum, storing the result in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vaddnps
__m512 _mm512_addn_round_ps (__m512 v2, __m512 v3, int rounding)

Synopsis

__m512 _mm512_addn_round_ps (__m512 v2, __m512 v3, int rounding)
#include "immintrin.h"
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element by element addition between packed single-precision (32-bit) floating-point elements in v2 and v3 and negates the sum, storing the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i]) ENDFOR dst[MAX:512] := 0
vaddnps
__m512 _mm512_mask_addn_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, int rounding)

Synopsis

__m512 _mm512_mask_addn_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, int rounding)
#include "immintrin.h"
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element by element addition between packed single-precision (32-bit) floating-point elements in v2 and v3 and negates the sum, storing the result in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
paddsw
__m128i _mm_adds_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_adds_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddsw xmm, xmm
CPUID Flags: SSE2

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpaddsw
__m128i _mm_mask_adds_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_adds_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpaddsw
__m128i _mm_maskz_adds_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_adds_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpaddsw
__m256i _mm256_adds_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_adds_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddsw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpaddsw
__m256i _mm256_mask_adds_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_adds_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpaddsw
__m256i _mm256_maskz_adds_epi16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_adds_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpaddsw
__m512i _mm512_adds_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_adds_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddsw
CPUID Flags: AVX512BW

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ENDFOR dst[MAX:512] := 0
vpaddsw
__m512i _mm512_mask_adds_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_adds_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddsw
CPUID Flags: AVX512BW

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpaddsw
__m512i _mm512_maskz_adds_epi16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_adds_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddsw
CPUID Flags: AVX512BW

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
paddsb
__m128i _mm_adds_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_adds_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddsb xmm, xmm
CPUID Flags: SSE2

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpaddsb
__m128i _mm_mask_adds_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_adds_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpaddsb
__m128i _mm_maskz_adds_epi8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_adds_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpaddsb
__m256i _mm256_adds_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_adds_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddsb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.5
vpaddsb
__m256i _mm256_mask_adds_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_adds_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpaddsb
__m256i _mm256_maskz_adds_epi8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_adds_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpaddsb
__m512i _mm512_adds_epi8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_adds_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddsb
CPUID Flags: AVX512BW

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 63 i := j*8 dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ENDFOR dst[MAX:512] := 0
vpaddsb
__m512i _mm512_mask_adds_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_adds_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddsb
CPUID Flags: AVX512BW

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpaddsb
__m512i _mm512_maskz_adds_epi8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_adds_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddsb
CPUID Flags: AVX512BW

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
paddusw
__m128i _mm_adds_epu16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_adds_epu16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddusw xmm, xmm
CPUID Flags: SSE2

Description

Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpaddusw
__m128i _mm_mask_adds_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_adds_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddusw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpaddusw
__m128i _mm_maskz_adds_epu16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_adds_epu16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddusw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpaddusw
__m256i _mm256_adds_epu16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_adds_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddusw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpaddusw
__m256i _mm256_mask_adds_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_adds_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddusw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpaddusw
__m256i _mm256_maskz_adds_epu16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_adds_epu16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddusw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpaddusw
__m512i _mm512_adds_epu16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_adds_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddusw
CPUID Flags: AVX512BW

Description

Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ENDFOR dst[MAX:512] := 0
vpaddusw
__m512i _mm512_mask_adds_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_adds_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddusw
CPUID Flags: AVX512BW

Description

Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpaddusw
__m512i _mm512_maskz_adds_epu16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_adds_epu16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddusw
CPUID Flags: AVX512BW

Description

Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
paddusb
__m128i _mm_adds_epu8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_adds_epu8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddusb xmm, xmm
CPUID Flags: SSE2

Description

Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpaddusb
__m128i _mm_mask_adds_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_adds_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddusb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpaddusb
__m128i _mm_maskz_adds_epu8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_adds_epu8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddusb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpaddusb
__m256i _mm256_adds_epu8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_adds_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddusb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.5
vpaddusb
__m256i _mm256_mask_adds_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_adds_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddusb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpaddusb
__m256i _mm256_maskz_adds_epu8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_adds_epu8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddusb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpaddusb
__m512i _mm512_adds_epu8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_adds_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddusb
CPUID Flags: AVX512BW

Description

Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 63 i := j*8 dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ENDFOR dst[MAX:512] := 0
vpaddusb
__m512i _mm512_mask_adds_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_adds_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddusb
CPUID Flags: AVX512BW

Description

Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpaddusb
__m512i _mm512_maskz_adds_epu8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_adds_epu8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddusb
CPUID Flags: AVX512BW

Description

Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpaddsetcd
__m512i _mm512_addsetc_epi32 (__m512i v2, __m512i v3, __mmask16 * k2_res)

Synopsis

__m512i _mm512_addsetc_epi32 (__m512i v2, __m512i v3, __mmask16 * k2_res)
#include "immintrin.h"
Instruction: vpaddsetcd zmm {k}, k, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition of packed 32-bit integer elements in v2 and v3, storing the resultant carry in k2_res (carry flag) and the addition results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := v2[i+31:i] + v3[i+31:i] k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i]) ENDFOR dst[MAX:512] := 0
vpaddsetcd
__m512i _mm512_mask_addsetc_epi32 (__m512i v2, __mmask16 k, __mmask16 k_old, __m512i v3, __mmask16 * k2_res)

Synopsis

__m512i _mm512_mask_addsetc_epi32 (__m512i v2, __mmask16 k, __mmask16 k_old, __m512i v3, __mmask16 * k2_res)
#include "immintrin.h"
Instruction: vpaddsetcd zmm {k}, k, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition of packed 32-bit integer elements in v2 and v3, storing the resultant carry in k2_res (carry flag) and the addition results in dst using writemask k (elements are copied from v2 and k_old when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := v2[i+31:i] + v3[i+31:i] ELSE dst[i+31:i] := v2[i+31:i] k2_res[j] := k_old[j] FI ENDFOR dst[MAX:512] := 0
vpaddsetsd
__m512i _mm512_addsets_epi32 (__m512i v2, __m512i v3, __mmask16 * sign)

Synopsis

__m512i _mm512_addsets_epi32 (__m512i v2, __m512i v3, __mmask16 * sign)
#include "immintrin.h"
Instruction: vpaddsetsd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs an element-by-element addition of packed 32-bit integer elements in v2 and v3, storing the results in dst and the sign of the sum in sign (sign flag).

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := v2[i+31:i] + v3[i+31:i] sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 ENDFOR dst[MAX:512] := 0
vpaddsetsd
__m512i _mm512_mask_addsets_epi32 (__m512i src, __mmask16 k, __m512i v2, __m512i v3, __mmask16 * sign)

Synopsis

__m512i _mm512_mask_addsets_epi32 (__m512i src, __mmask16 k, __m512i v2, __m512i v3, __mmask16 * sign)
#include "immintrin.h"
Instruction: vpaddsetsd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs an element-by-element addition of packed 32-bit integer elements in v2 and v3, storing the results in dst and the sign of the sum in sign (sign flag). Results are stored using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := v2[i+31:i] + v3[i+31:i] sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vaddsetsps
__m512 _mm512_addsets_ps (__m512 v2, __m512 v3, __mmask16 * sign)

Synopsis

__m512 _mm512_addsets_ps (__m512 v2, __m512 v3, __mmask16 * sign)
#include "immintrin.h"
Instruction: vaddsetsps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in v2 and v3, storing the results in dst and the sign of the sum in sign (sign flag).

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := v2[i+31:i] + v3[i+31:i] sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 ENDFOR dst[MAX:512] := 0
vaddsetsps
__m512 _mm512_mask_addsets_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, __mmask16 * sign)

Synopsis

__m512 _mm512_mask_addsets_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, __mmask16 * sign)
#include "immintrin.h"
Instruction: vaddsetsps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in v2 and v3, storing the results in dst and the sign of the sum in sign (sign flag). Results are stored using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := v2[i+31:i] + v3[i+31:i] sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vaddsetsps
__m512 _mm512_addsets_round_ps (__m512 v2, __m512 v3, __mmask16 * sign, int rounding)

Synopsis

__m512 _mm512_addsets_round_ps (__m512 v2, __m512 v3, __mmask16 * sign, int rounding)
#include "immintrin.h"
Instruction: vaddsetsps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in v2 and v3, storing the results in dst and the sign of the sum in sign (sign flag).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := v2[i+31:i] + v3[i+31:i] sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 ENDFOR dst[MAX:512] := 0
vaddsetsps
__m512 _mm512_mask_addsets_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, __mmask16 * sign, int rounding)

Synopsis

__m512 _mm512_mask_addsets_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, __mmask16 * sign, int rounding)
#include "immintrin.h"
Instruction: vaddsetsps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in v2 and v3, storing the results in dst and the sign of the sum in sign (sign flag). Results are stored using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := v2[i+31:i] + v3[i+31:i] sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
addsubpd
__m128d _mm_addsub_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_addsub_pd (__m128d a, __m128d b)
#include "pmmintrin.h"
Instruction: addsubpd xmm, xmm
CPUID Flags: SSE3

Description

Alternately add and subtract packed double-precision (64-bit) floating-point elements in a to/from packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 IF (j is even) dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := a[i+63:i] + b[i+63:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vaddsubpd
__m256d _mm256_addsub_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_addsub_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vaddsubpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Alternately add and subtract packed double-precision (64-bit) floating-point elements in a to/from packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 IF (j is even) dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := a[i+63:i] + b[i+63:i] FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
addsubps
__m128 _mm_addsub_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_addsub_ps (__m128 a, __m128 b)
#include "pmmintrin.h"
Instruction: addsubps xmm, xmm
CPUID Flags: SSE3

Description

Alternately add and subtract packed single-precision (32-bit) floating-point elements in a to/from packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 IF (j is even) dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] + b[i+31:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vaddsubps
__m256 _mm256_addsub_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_addsub_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vaddsubps ymm, ymm, ymm
CPUID Flags: AVX

Description

Alternately add and subtract packed single-precision (32-bit) floating-point elements in a to/from packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 IF (j is even) dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] + b[i+31:i] FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
aesdec
__m128i _mm_aesdec_si128 (__m128i a, __m128i RoundKey)

Synopsis

__m128i _mm_aesdec_si128 (__m128i a, __m128i RoundKey)
#include "wmmintrin.h"
Instruction: aesdec xmm, xmm
CPUID Flags: AES

Description

Perform one round of an AES decryption flow on data (state) in a using the round key in RoundKey, and store the result in dst.

Operation

state := a state[127:0] := InvShiftRows(state[127:0]) state[127:0] := InvSubBytes(state[127:0]) state[127:0] := InvMixColumns(state[127:0]) dst[127:0] := state[127:0] XOR RoundKey[127:0]

Performance

ArchitectureLatencyThroughput
Haswell71
Ivy Bridge71
Sandy Bridge71
Westmere62
aesdeclast
__m128i _mm_aesdeclast_si128 (__m128i a, __m128i RoundKey)

Synopsis

__m128i _mm_aesdeclast_si128 (__m128i a, __m128i RoundKey)
#include "wmmintrin.h"
Instruction: aesdeclast xmm, xmm
CPUID Flags: AES

Description

Perform the last round of an AES decryption flow on data (state) in a using the round key in RoundKey, and store the result in dst.

Operation

state := a state[127:0] := InvShiftRows(state[127:0]) state[127:0] := InvSubBytes(state[127:0]) dst[127:0] := state[127:0] XOR RoundKey[127:0]

Performance

ArchitectureLatencyThroughput
Haswell71
Ivy Bridge71
Sandy Bridge71
Westmere62
aesenc
__m128i _mm_aesenc_si128 (__m128i a, __m128i RoundKey)

Synopsis

__m128i _mm_aesenc_si128 (__m128i a, __m128i RoundKey)
#include "wmmintrin.h"
Instruction: aesenc xmm, xmm
CPUID Flags: AES

Description

Perform one round of an AES encryption flow on data (state) in a using the round key in RoundKey, and store the result in dst.

Operation

state := a state[127:0] := ShiftRows(state[127:0]) state[127:0] := SubBytes(state[127:0]) state[127:0] := MixColumns(state[127:0]) dst[127:0] := state[127:0] XOR RoundKey[127:0]

Performance

ArchitectureLatencyThroughput
Haswell71
Ivy Bridge71
Sandy Bridge71
Westmere62
aesenclast
__m128i _mm_aesenclast_si128 (__m128i a, __m128i RoundKey)

Synopsis

__m128i _mm_aesenclast_si128 (__m128i a, __m128i RoundKey)
#include "wmmintrin.h"
Instruction: aesenclast xmm, xmm
CPUID Flags: AES

Description

Perform the last round of an AES encryption flow on data (state) in a using the round key in RoundKey, and store the result in dst.

Operation

state := a state[127:0] := ShiftRows(state[127:0]) state[127:0] := SubBytes(state[127:0]) dst[127:0] := state[127:0] XOR RoundKey[127:0]

Performance

ArchitectureLatencyThroughput
Haswell71
Ivy Bridge71
Sandy Bridge71
Westmere62
aesimc
__m128i _mm_aesimc_si128 (__m128i a)

Synopsis

__m128i _mm_aesimc_si128 (__m128i a)
#include "wmmintrin.h"
Instruction: aesimc xmm, xmm
CPUID Flags: AES

Description

Perform the InvMixColumns transformation on a and store the result in dst.

Operation

dst[127:0] := InvMixColumns(a[127:0])

Performance

ArchitectureLatencyThroughput
Haswell142
Ivy Bridge122
Sandy Bridge122
Westmere62
aeskeygenassist
__m128i _mm_aeskeygenassist_si128 (__m128i a, const int imm8)

Synopsis

__m128i _mm_aeskeygenassist_si128 (__m128i a, const int imm8)
#include "wmmintrin.h"
Instruction: aeskeygenassist xmm, xmm, imm
CPUID Flags: AES

Description

Assist in expanding the AES cipher key by computing steps towards generating a round key for encryption cipher using data from a and an 8-bit round constant specified in imm8, and store the result in dst.

Operation

X3[31:0] := a[127:96] X2[31:0] := a[95:64] X1[31:0] := a[63:32] X0[31:0] := a[31:0] RCON[31:0] := ZeroExtend(imm8[7:0]) dst[31:0] := SubWord(X1) dst[63:32] := RotWord(SubWord(X1)) XOR RCON dst[95:64] := SubWord(X3) dst[127:96] := RotWord(SubWord(X3)) XOR RCON

Performance

ArchitectureLatencyThroughput
Haswell102
Ivy Bridge102
Sandy Bridge102
Westmere62
valignd
__m128i _mm_alignr_epi32 (__m128i a, __m128i b, const int count)

Synopsis

__m128i _mm_alignr_epi32 (__m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: valignd
CPUID Flags: AVX512F + AVX512VL

Description

Concatenate a and b into a 32-byte immediate result, shift the result right by count 32-bit elements, and store the low 16 bytes (4 elements) in dst.

Operation

temp[255:128] := a[127:0] temp[127:0] := b[127:0] temp[255:0] := temp[255:0] >> (32*count) dst[127:0] := temp[127:0] dst[MAX:128] := 0
valignd
__m128i _mm_mask_alignr_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b, const int count)

Synopsis

__m128i _mm_mask_alignr_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: valignd
CPUID Flags: AVX512F + AVX512VL

Description

Concatenate a and b into a 32-byte immediate result, shift the result right by count 32-bit elements, and store the low 16 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

temp[255:128] := a[127:0] temp[127:0] := b[127:0] temp[255:0] := temp[255:0] >> (32*count) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := temp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
valignd
__m128i _mm_maskz_alignr_epi32 (__mmask8 k, __m128i a, __m128i b, const int count)

Synopsis

__m128i _mm_maskz_alignr_epi32 (__mmask8 k, __m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: valignd
CPUID Flags: AVX512F + AVX512VL

Description

Concatenate a and b into a 32-byte immediate result, shift the result right by count 32-bit elements, and store the low 16 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

temp[255:128] := a[127:0] temp[127:0] := b[127:0] temp[255:0] := temp[255:0] >> (32*count) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := temp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
valignd
__m256i _mm256_alignr_epi32 (__m256i a, __m256i b, const int count)

Synopsis

__m256i _mm256_alignr_epi32 (__m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: valignd
CPUID Flags: AVX512F + AVX512VL

Description

Concatenate a and b into a 64-byte immediate result, shift the result right by count 32-bit elements, and store the low 32 bytes (8 elements) in dst.

Operation

temp[511:256] := a[255:0] temp[255:0] := b[255:0] temp[511:0] := temp[511:0] >> (32*count) dst[255:0] := temp[255:0] dst[MAX:256] := 0
valignd
__m256i _mm256_mask_alignr_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int count)

Synopsis

__m256i _mm256_mask_alignr_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: valignd
CPUID Flags: AVX512F + AVX512VL

Description

Concatenate a and b into a 64-byte immediate result, shift the result right by count 32-bit elements, and store the low 32 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

temp[511:256] := a[255:0] temp[255:0] := b[255:0] temp[511:0] := temp[511:0] >> (32*count) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := temp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
valignd
__m256i _mm256_maskz_alignr_epi32 (__mmask8 k, __m256i a, __m256i b, const int count)

Synopsis

__m256i _mm256_maskz_alignr_epi32 (__mmask8 k, __m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: valignd
CPUID Flags: AVX512F + AVX512VL

Description

Concatenate a and b into a 64-byte immediate result, shift the result right by count 32-bit elements, and store the low 32 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

temp[511:256] := a[255:0] temp[255:0] := b[255:0] temp[511:0] := temp[511:0] >> (32*count) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := temp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
valignd
__m512i _mm512_alignr_epi32 (__m512i a, __m512i b, const int count)

Synopsis

__m512i _mm512_alignr_epi32 (__m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: valignd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Concatenate a and b into a 128-byte immediate result, shift the result right by count 32-bit elements, and store the low 64 bytes (16 elements) in dst.

Operation

temp[1023:512] := a[511:0] temp[511:0] := b[511:0] temp[1023:0] := temp[1023:0] >> (32*count) dst[511:0] := temp[511:0] dst[MAX:512] := 0
valignd
__m512i _mm512_mask_alignr_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b, const int count)

Synopsis

__m512i _mm512_mask_alignr_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: valignd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Concatenate a and b into a 128-byte immediate result, shift the result right by count 32-bit elements, and store the low 64 bytes (16 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

temp[1023:512] := a[511:0] temp[511:0] := b[511:0] temp[1023:0] := temp[1023:0] >> (32*count) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := temp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
valignd
__m512i _mm512_maskz_alignr_epi32 (__mmask16 k, __m512i a, __m512i b, const int count)

Synopsis

__m512i _mm512_maskz_alignr_epi32 (__mmask16 k, __m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: valignd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Concatenate a and b into a 128-byte immediate result, shift the result right by count 32-bit elements, and store the low 64 bytes (16 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

temp[1023:512] := a[511:0] temp[511:0] := b[511:0] temp[1023:0] := temp[1023:0] >> (32*count) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := temp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
valignq
__m128i _mm_alignr_epi64 (__m128i a, __m128i b, const int count)

Synopsis

__m128i _mm_alignr_epi64 (__m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: valignq
CPUID Flags: AVX512F + AVX512VL

Description

Concatenate a and b into a 32-byte immediate result, shift the result right by count 64-bit elements, and store the low 16 bytes (2 elements) in dst.

Operation

temp[255:128] := a[127:0] temp[127:0] := b[127:0] temp[255:0] := temp[255:0] >> (64*count) dst[127:0] := temp[127:0] dst[MAX:128] := 0
valignq
__m128i _mm_mask_alignr_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b, const int count)

Synopsis

__m128i _mm_mask_alignr_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: valignq
CPUID Flags: AVX512F + AVX512VL

Description

Concatenate a and b into a 32-byte immediate result, shift the result right by count 64-bit elements, and store the low 16 bytes (2 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

temp[255:128] := a[127:0] temp[127:0] := b[127:0] temp[255:0] := temp[255:0] >> (64*count) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := temp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
valignq
__m128i _mm_maskz_alignr_epi64 (__mmask8 k, __m128i a, __m128i b, const int count)

Synopsis

__m128i _mm_maskz_alignr_epi64 (__mmask8 k, __m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: valignq
CPUID Flags: AVX512F + AVX512VL

Description

Concatenate a and b into a 32-byte immediate result, shift the result right by count 64-bit elements, and store the low 16 bytes (2 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

temp[255:128] := a[127:0] temp[127:0] := b[127:0] temp[255:0] := temp[255:0] >> (64*count) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := temp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
valignq
__m256i _mm256_alignr_epi64 (__m256i a, __m256i b, const int count)

Synopsis

__m256i _mm256_alignr_epi64 (__m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: valignq
CPUID Flags: AVX512F + AVX512VL

Description

Concatenate a and b into a 64-byte immediate result, shift the result right by count 64-bit elements, and store the low 32 bytes (4 elements) in dst.

Operation

temp[511:256] := a[255:0] temp[255:0] := b[255:0] temp[511:0] := temp[511:0] >> (64*count) dst[255:0] := temp[255:0] dst[MAX:256] := 0
valignq
__m256i _mm256_mask_alignr_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int count)

Synopsis

__m256i _mm256_mask_alignr_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: valignq
CPUID Flags: AVX512F + AVX512VL

Description

Concatenate a and b into a 64-byte immediate result, shift the result right by count 64-bit elements, and store the low 32 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

temp[511:256] := a[255:0] temp[255:0] := b[255:0] temp[511:0] := temp[511:0] >> (64*count) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := temp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
valignq
__m256i _mm256_maskz_alignr_epi64 (__mmask8 k, __m256i a, __m256i b, const int count)

Synopsis

__m256i _mm256_maskz_alignr_epi64 (__mmask8 k, __m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: valignq
CPUID Flags: AVX512F + AVX512VL

Description

Concatenate a and b into a 64-byte immediate result, shift the result right by count 64-bit elements, and store the low 32 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

temp[511:256] := a[255:0] temp[255:0] := b[255:0] temp[511:0] := temp[511:0] >> (64*count) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := temp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
valignq
__m512i _mm512_alignr_epi64 (__m512i a, __m512i b, const int count)

Synopsis

__m512i _mm512_alignr_epi64 (__m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: valignq zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Concatenate a and b into a 128-byte immediate result, shift the result right by count 64-bit elements, and store the low 64 bytes (8 elements) in dst.

Operation

temp[1023:512] := a[511:0] temp[511:0] := b[511:0] temp[1023:0] := temp[1023:0] >> (64*count) dst[511:0] := temp[511:0] dst[MAX:512] := 0
valignq
__m512i _mm512_mask_alignr_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b, const int count)

Synopsis

__m512i _mm512_mask_alignr_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: valignq zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Concatenate a and b into a 128-byte immediate result, shift the result right by count 64-bit elements, and store the low 64 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

temp[1023:512] := a[511:0] temp[511:0] := b[511:0] temp[1023:0] := temp[1023:0] >> (64*count) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := temp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
valignq
__m512i _mm512_maskz_alignr_epi64 (__mmask8 k, __m512i a, __m512i b, const int count)

Synopsis

__m512i _mm512_maskz_alignr_epi64 (__mmask8 k, __m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: valignq zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Concatenate a and b into a 128-byte immediate result, shift the result right by count 64-bit elements, and store the low 64 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

temp[1023:512] := a[511:0] temp[511:0] := b[511:0] temp[1023:0] := temp[1023:0] >> (64*count) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := temp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
palignr
__m128i _mm_alignr_epi8 (__m128i a, __m128i b, int count)

Synopsis

__m128i _mm_alignr_epi8 (__m128i a, __m128i b, int count)
#include "tmmintrin.h"
Instruction: palignr xmm, xmm, imm
CPUID Flags: SSSE3

Description

Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst.

Operation

tmp[255:0] := ((a[127:0] << 128) OR b[127:0]) >> (count[7:0]*8) dst[127:0] := tmp[127:0]

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpalignr
__m128i _mm_mask_alignr_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b, const int count)

Synopsis

__m128i _mm_mask_alignr_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: vpalignr
CPUID Flags: AVX512VL + AVX512BW

Description

Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[255:0] := ((a[127:0] << 128) OR b[127:0]) >> (count[7:0]*8) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpalignr
__m128i _mm_maskz_alignr_epi8 (__mmask16 k, __m128i a, __m128i b, const int count)

Synopsis

__m128i _mm_maskz_alignr_epi8 (__mmask16 k, __m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: vpalignr
CPUID Flags: AVX512VL + AVX512BW

Description

Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[255:0] := ((a[127:0] << 128) OR b[127:0]) >> (count[7:0]*8) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpalignr
__m256i _mm256_alignr_epi8 (__m256i a, __m256i b, const int count)

Synopsis

__m256i _mm256_alignr_epi8 (__m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: vpalignr ymm, ymm, ymm, imm
CPUID Flags: AVX2

Description

Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst.

Operation

FOR j := 0 to 1 i := j*128 tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8) dst[i+127:i] := tmp[127:0] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpalignr
__m256i _mm256_mask_alignr_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b, const int count)

Synopsis

__m256i _mm256_mask_alignr_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: vpalignr
CPUID Flags: AVX512VL + AVX512BW

Description

Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*128 tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8) tmp_dst[i+127:i] := tmp[127:0] ENDFOR FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpalignr
__m256i _mm256_maskz_alignr_epi8 (__mmask32 k, __m256i a, __m256i b, const int count)

Synopsis

__m256i _mm256_maskz_alignr_epi8 (__mmask32 k, __m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: vpalignr
CPUID Flags: AVX512VL + AVX512BW

Description

Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*128 tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8) tmp_dst[i+127:i] := tmp[127:0] ENDFOR FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpalignr
__m512i _mm512_alignr_epi8 (__m512i a, __m512i b, const int count)

Synopsis

__m512i _mm512_alignr_epi8 (__m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: vpalignr
CPUID Flags: AVX512BW

Description

Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst.

Operation

FOR j := 0 to 3 i := j*128 tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8) dst[i+127:i] := tmp[127:0] ENDFOR dst[MAX:512] := 0
vpalignr
__m512i _mm512_mask_alignr_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b, const int count)

Synopsis

__m512i _mm512_mask_alignr_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: vpalignr
CPUID Flags: AVX512BW

Description

Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*128 tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8) tmp_dst[i+127:i] := tmp[127:0] ENDFOR FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpalignr
__m512i _mm512_maskz_alignr_epi8 (__mmask64 k, __m512i a, __m512i b, const int count)

Synopsis

__m512i _mm512_maskz_alignr_epi8 (__mmask64 k, __m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: vpalignr
CPUID Flags: AVX512BW

Description

Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*128 tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8) tmp_dst[i+127:i] := tmp[127:0] ENDFOR FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
palignr
__m64 _mm_alignr_pi8 (__m64 a, __m64 b, int count)

Synopsis

__m64 _mm_alignr_pi8 (__m64 a, __m64 b, int count)
#include "tmmintrin.h"
Instruction: palignr mm, mm, imm
CPUID Flags: SSSE3

Description

Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift the result right by count bytes, and store the low 8 bytes in dst.

Operation

tmp[127:0] := ((a[63:0] << 64) OR b[63:0]) >> (count[7:0]*8) dst[63:0] := tmp[63:0]
...
void _allow_cpu_features (unsigned __int64 a)

Synopsis

void _allow_cpu_features (unsigned __int64 a)
#include "immintrin.h"

Description

Treat the processor-specific feature(s) specified in a as available. Multiple features may be OR'd together. See the valid feature flags below:

Operation

_FEATURE_GENERIC_IA32 _FEATURE_FPU _FEATURE_CMOV _FEATURE_MMX _FEATURE_FXSAVE _FEATURE_SSE _FEATURE_SSE2 _FEATURE_SSE3 _FEATURE_SSSE3 _FEATURE_SSE4_1 _FEATURE_SSE4_2 _FEATURE_MOVBE _FEATURE_POPCNT _FEATURE_PCLMULQDQ _FEATURE_AES _FEATURE_F16C _FEATURE_AVX _FEATURE_RDRND _FEATURE_FMA _FEATURE_BMI _FEATURE_LZCNT _FEATURE_HLE _FEATURE_RTM _FEATURE_AVX2 _FEATURE_KNCNI _FEATURE_AVX512F _FEATURE_ADX _FEATURE_RDSEED _FEATURE_AVX512ER _FEATURE_AVX512PF _FEATURE_AVX512CD _FEATURE_SHA _FEATURE_MPX
vpandd
__m128i _mm_mask_and_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_and_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] AND b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpandd
__m128i _mm_maskz_and_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_and_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] AND b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpandd
__m256i _mm256_mask_and_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_and_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] AND b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpandd
__m256i _mm256_maskz_and_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_and_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] AND b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpandd
__m512i _mm512_and_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_and_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i] AND b[i+31:i] ENDFOR dst[MAX:512] := 0
vpandd
__m512i _mm512_mask_and_epi32 (__m512i src, __mmask16 k, __m512i v2, __m512i v3)

Synopsis

__m512i _mm512_mask_and_epi32 (__m512i src, __mmask16 k, __m512i v2, __m512i v3)
#include "immintrin.h"
Instruction: vpandd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Performs element-by-element bitwise AND between packed 32-bit integer elements of v2 and v3, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := v2[i+31:i] & v3[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpandd
__m512i _mm512_maskz_and_epi32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_and_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] AND b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpandq
__m128i _mm_mask_and_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_and_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] AND b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpandq
__m128i _mm_maskz_and_epi64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_and_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] AND b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpandq
__m256i _mm256_mask_and_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_and_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] AND b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpandq
__m256i _mm256_maskz_and_epi64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_and_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] AND b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpandq
__m512i _mm512_and_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_and_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in a and b, and store the results in dst.

Operation

dst[511:0] := (a[511:0] AND b[511:0]) dst[MAX:512] := 0
vpandq
__m512i _mm512_mask_and_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_and_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] AND b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpandq
__m512i _mm512_maskz_and_epi64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_and_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] AND b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
andpd
__m128d _mm_and_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_and_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: andpd xmm, xmm
CPUID Flags: SSE2

Description

Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.8
Ivy Bridge11
Sandy Bridge11
Westmere10.33
Nehalem10.33
vandpd
__m128d _mm_mask_and_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_and_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vandpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vandpd
__m128d _mm_maskz_and_pd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_and_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vandpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vandpd
__m256d _mm256_and_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_and_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vandpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vandpd
__m256d _mm256_mask_and_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_mask_and_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vandpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vandpd
__m256d _mm256_maskz_and_pd (__mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_maskz_and_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vandpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vandpd
__m512d _mm512_and_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_and_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vandpd
CPUID Flags: AVX512DQ

Description

Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) ENDFOR dst[MAX:512] := 0
vandpd
__m512d _mm512_mask_and_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_and_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vandpd
CPUID Flags: AVX512DQ

Description

Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vandpd
__m512d _mm512_maskz_and_pd (__mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_maskz_and_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vandpd
CPUID Flags: AVX512DQ

Description

Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
andps
__m128 _mm_and_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_and_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: andps xmm, xmm
CPUID Flags: SSE

Description

Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
Westmere10.33
Nehalem10.33
vandps
__m128 _mm_mask_and_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_and_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vandps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vandps
__m128 _mm_maskz_and_ps (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_and_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vandps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vandps
__m256 _mm256_and_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_and_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vandps ymm, ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vandps
__m256 _mm256_mask_and_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_mask_and_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vandps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vandps
__m256 _mm256_maskz_and_ps (__mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_maskz_and_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vandps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vandps
__m512 _mm512_and_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_and_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vandps
CPUID Flags: AVX512DQ

Description

Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) ENDFOR dst[MAX:512] := 0
vandps
__m512 _mm512_mask_and_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_and_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vandps
CPUID Flags: AVX512DQ

Description

Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vandps
__m512 _mm512_maskz_and_ps (__mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_maskz_and_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vandps
CPUID Flags: AVX512DQ

Description

Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
pand
__m128i _mm_and_si128 (__m128i a, __m128i b)

Synopsis

__m128i _mm_and_si128 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pand xmm, xmm
CPUID Flags: SSE2

Description

Compute the bitwise AND of 128 bits (representing integer data) in a and b, and store the result in dst.

Operation

dst[127:0] := (a[127:0] AND b[127:0])

Performance

ArchitectureLatencyThroughput
Haswell10.33
Ivy Bridge10.33
Sandy Bridge10.33
Westmere10.33
Nehalem10.33
vpand
__m256i _mm256_and_si256 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_and_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpand ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compute the bitwise AND of 256 bits (representing integer data) in a and b, and store the result in dst.

Operation

dst[255:0] := (a[255:0] AND b[255:0]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpandd
__m512i _mm512_and_si512 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_and_si512 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise AND of 512 bits (representing integer data) in a and b, and store the result in dst.

Operation

dst[511:0] := (a[511:0] AND b[511:0]) dst[MAX:512] := 0
vpandnd
__m128i _mm_mask_andnot_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_andnot_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandnd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND NOT of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpandnd
__m128i _mm_maskz_andnot_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_andnot_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandnd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND NOT of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpandnd
__m256i _mm256_mask_andnot_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_andnot_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandnd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND NOT of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpandnd
__m256i _mm256_maskz_andnot_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_andnot_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandnd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND NOT of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpandnd
__m512i _mm512_andnot_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_andnot_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandnd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise AND NOT of packed 32-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] ENDFOR dst[MAX:512] := 0
vpandnd
__m512i _mm512_mask_andnot_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_andnot_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandnd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise AND NOT of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpandnd
__m512i _mm512_maskz_andnot_epi32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_andnot_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandnd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compute the bitwise AND NOT of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpandnq
__m128i _mm_mask_andnot_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_andnot_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandnq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND NOT of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpandnq
__m128i _mm_maskz_andnot_epi64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_andnot_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandnq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND NOT of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpandnq
__m256i _mm256_mask_andnot_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_andnot_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandnq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND NOT of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpandnq
__m256i _mm256_maskz_andnot_epi64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_andnot_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandnq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND NOT of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpandnq
__m512i _mm512_andnot_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_andnot_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandnq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise AND NOT of 512 bits (composed of packed 64-bit integers) in a and b, and store the results in dst.

Operation

dst[511:0] := ((NOT a[511:0]) AND b[511:0]) dst[MAX:512] := 0
vpandnq
__m512i _mm512_mask_andnot_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_andnot_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandnq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise AND NOT of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpandnq
__m512i _mm512_maskz_andnot_epi64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_andnot_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandnq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compute the bitwise AND NOT of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
andnpd
__m128d _mm_andnot_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_andnot_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: andnpd xmm, xmm
CPUID Flags: SSE2

Description

Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.8
Ivy Bridge11
Sandy Bridge11
Westmere10.33
Nehalem10.33
vandnpd
__m128d _mm_mask_andnot_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_andnot_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vandnpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vandnpd
__m128d _mm_maskz_andnot_pd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_andnot_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vandnpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vandnpd
__m256d _mm256_andnot_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_andnot_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vandnpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vandnpd
__m256d _mm256_mask_andnot_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_mask_andnot_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vandnpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vandnpd
__m256d _mm256_maskz_andnot_pd (__mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_maskz_andnot_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vandnpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vandnpd
__m512d _mm512_andnot_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_andnot_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vandnpd
CPUID Flags: AVX512DQ

Description

Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ENDFOR dst[MAX:512] := 0
vandnpd
__m512d _mm512_mask_andnot_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_andnot_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vandnpd
CPUID Flags: AVX512DQ

Description

Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vandnpd
__m512d _mm512_maskz_andnot_pd (__mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_maskz_andnot_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vandnpd
CPUID Flags: AVX512DQ

Description

Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
andnps
__m128 _mm_andnot_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_andnot_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: andnps xmm, xmm
CPUID Flags: SSE

Description

Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
Westmere10.33
Nehalem10.33
vandnps
__m128 _mm_mask_andnot_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_andnot_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vandnps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vandnps
__m128 _mm_maskz_andnot_ps (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_andnot_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vandnps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vandnps
__m256 _mm256_andnot_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_andnot_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vandnps ymm, ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vandnps
__m256 _mm256_mask_andnot_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_mask_andnot_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vandnps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vandnps
__m256 _mm256_maskz_andnot_ps (__mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_maskz_andnot_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vandnps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vandnps
__m512 _mm512_andnot_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_andnot_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vandnps
CPUID Flags: AVX512DQ

Description

Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ENDFOR dst[MAX:512] := 0
vandnps
__m512 _mm512_mask_andnot_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_andnot_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vandnps
CPUID Flags: AVX512DQ

Description

Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vandnps
__m512 _mm512_maskz_andnot_ps (__mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_maskz_andnot_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vandnps
CPUID Flags: AVX512DQ

Description

Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
pandn
__m128i _mm_andnot_si128 (__m128i a, __m128i b)

Synopsis

__m128i _mm_andnot_si128 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pandn xmm, xmm
CPUID Flags: SSE2

Description

Compute the bitwise AND NOT of 128 bits (representing integer data) in a and b, and store the result in dst.

Operation

dst[127:0] := ((NOT a[127:0]) AND b[127:0])

Performance

ArchitectureLatencyThroughput
Haswell10.33
Ivy Bridge10.33
Sandy Bridge10.33
Westmere10.33
Nehalem10.33
vpandn
__m256i _mm256_andnot_si256 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_andnot_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandn ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compute the bitwise AND NOT of 256 bits (representing integer data) in a and b, and store the result in dst.

Operation

dst[255:0] := ((NOT a[255:0]) AND b[255:0]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpandnd
__m512i _mm512_andnot_si512 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_andnot_si512 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandnd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise AND NOT of 512 bits (representing integer data) in a and b, and store the result in dst.

Operation

dst[511:0] := ((NOT a[511:0]) AND b[511:0]) dst[MAX:512] := 0
...
__m128d _mm_asin_pd (__m128d a)

Synopsis

__m128d _mm_asin_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ASIN(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_asin_pd (__m256d a)

Synopsis

__m256d _mm256_asin_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ASIN(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_asin_pd (__m512d a)

Synopsis

__m512d _mm512_asin_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ASIN(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_asin_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_asin_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ASIN(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_asin_ps (__m128 a)

Synopsis

__m128 _mm_asin_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ASIN(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_asin_ps (__m256 a)

Synopsis

__m256 _mm256_asin_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ASIN(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_asin_ps (__m512 a)

Synopsis

__m512 _mm512_asin_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ASIN(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_asin_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_asin_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ASIN(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_asinh_pd (__m128d a)

Synopsis

__m128d _mm_asinh_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ASINH(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_asinh_pd (__m256d a)

Synopsis

__m256d _mm256_asinh_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ASINH(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_asinh_pd (__m512d a)

Synopsis

__m512d _mm512_asinh_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ASINH(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_asinh_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_asinh_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ASINH(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_asinh_ps (__m128 a)

Synopsis

__m128 _mm_asinh_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ASINH(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_asinh_ps (__m256 a)

Synopsis

__m256 _mm256_asinh_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ASINH(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_asinh_ps (__m512 a)

Synopsis

__m512 _mm512_asinh_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ASINH(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_asinh_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_asinh_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ASINH(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_atan_pd (__m128d a)

Synopsis

__m128d _mm_atan_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ATAN(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_atan_pd (__m256d a)

Synopsis

__m256d _mm256_atan_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ATAN(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_atan_pd (__m512d a)

Synopsis

__m512d _mm512_atan_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a and store the results in dst expressed in radians.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ATAN(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_atan_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_atan_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a, and store the results in dst expressed in radians using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ATAN(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_atan_ps (__m128 a)

Synopsis

__m128 _mm_atan_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ATAN(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_atan_ps (__m256 a)

Synopsis

__m256 _mm256_atan_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ATAN(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_atan_ps (__m512 a)

Synopsis

__m512 _mm512_atan_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a, and store the results in dst expressed in radians.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ATAN(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_atan_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_atan_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ATAN(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_atan2_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_atan2_pd (__m128d a, __m128d b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_atan2_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_atan2_pd (__m256d a, __m256d b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_atan2_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_atan2_pd (__m512d a, __m512d b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_atan2_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_atan2_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_atan2_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_atan2_ps (__m128 a, __m128 b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_atan2_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_atan2_ps (__m256 a, __m256 b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_atan2_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_atan2_ps (__m512 a, __m512 b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_atan2_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_atan2_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_atanh_pd (__m128d a)

Synopsis

__m128d _mm_atanh_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ATANH(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_atanh_pd (__m256d a)

Synopsis

__m256d _mm256_atanh_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ATANH(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_atanh_pd (__m512d a)

Synopsis

__m512d _mm512_atanh_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a and store the results in dst expressed in radians.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ATANH(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_atanh_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_atanh_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a, and store the results in dst expressed in radians using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ATANH(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_atanh_ps (__m128 a)

Synopsis

__m128 _mm_atanh_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ATANH(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_atanh_ps (__m256 a)

Synopsis

__m256 _mm256_atanh_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ATANH(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_atanh_ps (__m512 a)

Synopsis

__m512 _mm512_atanh_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a, and store the results in dst expressed in radians.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ATANH(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_atanh_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_atanh_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ATANH(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
pavgw
__m128i _mm_avg_epu16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_avg_epu16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pavgw xmm, xmm
CPUID Flags: SSE2

Description

Average packed unsigned 16-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpavgw
__m128i _mm_mask_avg_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_avg_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpavgw
CPUID Flags: AVX512VL + AVX512BW

Description

Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpavgw
__m128i _mm_maskz_avg_epu16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_avg_epu16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpavgw
CPUID Flags: AVX512VL + AVX512BW

Description

Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpavgw
__m256i _mm256_avg_epu16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_avg_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpavgw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Average packed unsigned 16-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpavgw
__m256i _mm256_mask_avg_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_avg_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpavgw
CPUID Flags: AVX512VL + AVX512BW

Description

Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpavgw
__m256i _mm256_maskz_avg_epu16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_avg_epu16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpavgw
CPUID Flags: AVX512VL + AVX512BW

Description

Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpavgw
__m512i _mm512_avg_epu16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_avg_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpavgw
CPUID Flags: AVX512BW

Description

Average packed unsigned 16-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ENDFOR dst[MAX:512] := 0
vpavgw
__m512i _mm512_mask_avg_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_avg_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpavgw
CPUID Flags: AVX512BW

Description

Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpavgw
__m512i _mm512_maskz_avg_epu16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_avg_epu16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpavgw
CPUID Flags: AVX512BW

Description

Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
pavgb
__m128i _mm_avg_epu8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_avg_epu8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pavgb xmm, xmm
CPUID Flags: SSE2

Description

Average packed unsigned 8-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpavgb
__m128i _mm_mask_avg_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_avg_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpavgb
CPUID Flags: AVX512VL + AVX512BW

Description

Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpavgb
__m128i _mm_maskz_avg_epu8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_avg_epu8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpavgb
CPUID Flags: AVX512VL + AVX512BW

Description

Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpavgb
__m256i _mm256_avg_epu8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_avg_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpavgb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Average packed unsigned 8-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.5
vpavgb
__m256i _mm256_mask_avg_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_avg_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpavgb
CPUID Flags: AVX512VL + AVX512BW

Description

Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpavgb
__m256i _mm256_maskz_avg_epu8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_avg_epu8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpavgb
CPUID Flags: AVX512VL + AVX512BW

Description

Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpavgb
__m512i _mm512_avg_epu8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_avg_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpavgb
CPUID Flags: AVX512BW

Description

Average packed unsigned 8-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 63 i := j*8 dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ENDFOR dst[MAX:512] := 0
vpavgb
__m512i _mm512_mask_avg_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_avg_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpavgb
CPUID Flags: AVX512BW

Description

Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpavgb
__m512i _mm512_maskz_avg_epu8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_avg_epu8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpavgb
CPUID Flags: AVX512BW

Description

Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
pavgw
__m64 _mm_avg_pu16 (__m64 a, __m64 b)

Synopsis

__m64 _mm_avg_pu16 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pavgw mm, mm
CPUID Flags: SSE

Description

Average packed unsigned 16-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*16 dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ENDFOR
pavgb
__m64 _mm_avg_pu8 (__m64 a, __m64 b)

Synopsis

__m64 _mm_avg_pu8 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pavgb mm, mm
CPUID Flags: SSE

Description

Average packed unsigned 8-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*8 dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ENDFOR
bextr
unsigned int _bextr_u32 (unsigned int a, unsigned int start, unsigned int len)

Synopsis

unsigned int _bextr_u32 (unsigned int a, unsigned int start, unsigned int len)
#include "immintrin.h"
Instruction: bextr r32, r32, r32
CPUID Flags: BMI1

Description

Extract contiguous bits from unsigned 32-bit integer a, and store the result in dst. Extract the number of bits specified by len, starting at the bit specified by start.

Operation

tmp := ZERO_EXTEND_TO_512(a) dst := ZERO_EXTEND(tmp[start+len-1:start])

Performance

ArchitectureLatencyThroughput
Haswell2-
bextr
unsigned __int64 _bextr_u64 (unsigned __int64 a, unsigned int start, unsigned int len)

Synopsis

unsigned __int64 _bextr_u64 (unsigned __int64 a, unsigned int start, unsigned int len)
#include "immintrin.h"
Instruction: bextr r64, r64, r64
CPUID Flags: BMI1

Description

Extract contiguous bits from unsigned 64-bit integer a, and store the result in dst. Extract the number of bits specified by len, starting at the bit specified by start.

Operation

tmp := ZERO_EXTEND_TO_512(a) dst := ZERO_EXTEND(tmp[start+len-1:start])

Performance

ArchitectureLatencyThroughput
Haswell2-
bsf
int _bit_scan_forward (int a)

Synopsis

int _bit_scan_forward (int a)
#include "immintrin.h"
Instruction: bsf r32, r32

Description

Set dst to the index of the lowest set bit in 32-bit integer a. If no bits are set in a then dst is undefined.

Operation

tmp := 0 IF a = 0 dst := undefined ELSE DO WHILE ((tmp < 32) AND a[tmp] = 0) tmp := tmp + 1 dst := tmp OD FI

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
bsr
int _bit_scan_reverse (int a)

Synopsis

int _bit_scan_reverse (int a)
#include "immintrin.h"
Instruction: bsr r32, r32

Description

Set dst to the index of the highest set bit in 32-bit integer a. If no bits are set in a then dst is undefined.

Operation

tmp := 31 IF a = 0 dst := undefined ELSE DO WHILE ((tmp > 0) AND a[tmp] = 0) tmp := tmp - 1 dst := tmp OD FI

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
bsf
unsigned char _BitScanForward (unsigned __int32* index, unsigned __int32 mask)

Synopsis

unsigned char _BitScanForward (unsigned __int32* index, unsigned __int32 mask)
#include "immintrin.h"
Instruction: bsf r32, r32

Description

Set index to the index of the lowest set bit in 32-bit integer mask. If no bits are set in mask, then set dst to 0, otherwise set dst to 1.

Operation

tmp := 0 IF mask = 0 dst := 0 ELSE DO WHILE ((tmp < 32) AND mask[tmp] = 0) tmp := tmp + 1 index := tmp dst := 1 OD FI

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
bsf
unsigned char _BitScanForward64 (unsigned __int32* index, unsigned __int64 mask)

Synopsis

unsigned char _BitScanForward64 (unsigned __int32* index, unsigned __int64 mask)
#include "immintrin.h"
Instruction: bsf r64, r64

Description

Set index to the index of the lowest set bit in 64-bit integer mask. If no bits are set in mask, then set dst to 0, otherwise set dst to 1.

Operation

tmp := 0 IF mask = 0 dst := 0 ELSE DO WHILE ((tmp < 64) AND mask[tmp] = 0) tmp := tmp + 1 index := tmp dst := 1 OD FI

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
bsr
unsigned char _BitScanReverse (unsigned __int32* index, unsigned __int32 mask)

Synopsis

unsigned char _BitScanReverse (unsigned __int32* index, unsigned __int32 mask)
#include "immintrin.h"
Instruction: bsr r32, r32

Description

Set index to the index of the highest set bit in 32-bit integer mask. If no bits are set in mask, then set dst to 0, otherwise set dst to 1.

Operation

tmp := 31 IF mask = 0 dst := 0 ELSE DO WHILE ((tmp > 0) AND mask[tmp] = 0) tmp := tmp - 1 index := tmp dst := 1 OD FI

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
bsr
unsigned char _BitScanReverse64 (unsigned __int32* index, unsigned __int64 mask)

Synopsis

unsigned char _BitScanReverse64 (unsigned __int32* index, unsigned __int64 mask)
#include "immintrin.h"
Instruction: bsr r64, r64

Description

Set index to the index of the highest set bit in 64-bit integer mask. If no bits are set in mask, then set dst to 0, otherwise set dst to 1.

Operation

tmp := 63 IF mask = 0 dst := 0 ELSE DO WHILE ((tmp > 0) AND mask[tmp] = 0) tmp := tmp - 1 index := tmp dst := 1 OD FI

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
bt
unsigned char _bittest (__int32* a, __int32 b)

Synopsis

unsigned char _bittest (__int32* a, __int32 b)
#include "immintrin.h"
Instruction: bt r32, r32

Description

Return the bit at index b of 32-bit integer a.

Operation

dst := a[b]
bt
unsigned char _bittest64 (__int64* a, __int64 b)

Synopsis

unsigned char _bittest64 (__int64* a, __int64 b)
#include "immintrin.h"
Instruction: bt r64, r64

Description

Return the bit at index b of 64-bit integer a.

Operation

dst := a[b]
btc
unsigned char _bittestandcomplement (__int32* a, __int32 b)

Synopsis

unsigned char _bittestandcomplement (__int32* a, __int32 b)
#include "immintrin.h"
Instruction: btc r32, r32

Description

Return the bit at index b of 32-bit integer a, and set that bit to its complement.

Operation

dst := a[b] a[b] := ~a[b]
btc
unsigned char _bittestandcomplement64 (__int64* a, __int64 b)

Synopsis

unsigned char _bittestandcomplement64 (__int64* a, __int64 b)
#include "immintrin.h"
Instruction: btc r64, r64

Description

Return the bit at index b of 64-bit integer a, and set that bit to its complement.

Operation

dst := a[b] a[b] := ~a[b]
btr
unsigned char _bittestandreset (__int32* a, __int32 b)

Synopsis

unsigned char _bittestandreset (__int32* a, __int32 b)
#include "immintrin.h"
Instruction: btr r32, r32

Description

Return the bit at index b of 32-bit integer a, and set that bit to zero.

Operation

dst := a[b] a[b] := 0
btr
unsigned char _bittestandreset64 (__int64* a, __int64 b)

Synopsis

unsigned char _bittestandreset64 (__int64* a, __int64 b)
#include "immintrin.h"
Instruction: btr r64, r64

Description

Return the bit at index b of 64-bit integer a, and set that bit to zero.

Operation

dst := a[b] a[b] := 0
bts
unsigned char _bittestandset (__int32* a, __int32 b)

Synopsis

unsigned char _bittestandset (__int32* a, __int32 b)
#include "immintrin.h"
Instruction: bts r32, r32

Description

Return the bit at index b of 32-bit integer a, and set that bit to one.

Operation

dst := a[b] a[b] := 1
bts
unsigned char _bittestandset64 (__int64* a, __int64 b)

Synopsis

unsigned char _bittestandset64 (__int64* a, __int64 b)
#include "immintrin.h"
Instruction: bts r64, r64

Description

Return the bit at index b of 64-bit integer a, and set that bit to one.

Operation

dst := a[b] a[b] := 1
pblendw
__m128i _mm_blend_epi16 (__m128i a, __m128i b, const int imm8)

Synopsis

__m128i _mm_blend_epi16 (__m128i a, __m128i b, const int imm8)
#include "smmintrin.h"
Instruction: pblendw xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Blend packed 16-bit integers from a and b using control mask imm8, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 IF imm8[j%8] dst[i+15:i] := b[i+15:i] ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpblendmw
__m128i _mm_mask_blend_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_blend_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpblendmw
CPUID Flags: AVX512VL + AVX512BW

Description

Blend packed 16-bit integers from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := b[i+15:i] ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpblendw
__m256i _mm256_blend_epi16 (__m256i a, __m256i b, const int imm8)

Synopsis

__m256i _mm256_blend_epi16 (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpblendw ymm, ymm, ymm, imm
CPUID Flags: AVX2

Description

Blend packed 16-bit integers from a and b using control mask imm8, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 IF imm8[j%8] dst[i+15:i] := b[i+15:i] ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
vpblendmw
__m256i _mm256_mask_blend_epi16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_blend_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpblendmw
CPUID Flags: AVX512VL + AVX512BW

Description

Blend packed 16-bit integers from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := b[i+15:i] ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpblendmw
__m512i _mm512_mask_blend_epi16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_blend_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpblendmw
CPUID Flags: AVX512BW

Description

Blend packed 16-bit integers from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := b[i+15:i] ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpblendd
__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8)

Synopsis

__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpblendd xmm, xmm, xmm, imm
CPUID Flags: AVX2

Description

Blend packed 32-bit integers from a and b using control mask imm8, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 IF imm8[j%8] dst[i+31:i] := b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.33
vpblendmd
__m128i _mm_mask_blend_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_blend_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpblendmd
CPUID Flags: AVX512VL + AVX512F

Description

Blend packed 32-bit integers from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpblendd
__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8)

Synopsis

__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpblendd ymm, ymm, ymm, imm
CPUID Flags: AVX2

Description

Blend packed 32-bit integers from a and b using control mask imm8, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 IF imm8[j%8] dst[i+31:i] := b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.33
vpblendmd
__m256i _mm256_mask_blend_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_blend_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpblendmd
CPUID Flags: AVX512VL + AVX512F

Description

Blend packed 32-bit integers from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpblendmd
__m512i _mm512_mask_blend_epi32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_blend_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpblendmd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Blend packed 32-bit integers from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpblendmq
__m128i _mm_mask_blend_epi64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_blend_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpblendmq
CPUID Flags: AVX512VL + AVX512F

Description

Blend packed 64-bit integers from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := b[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpblendmq
__m256i _mm256_mask_blend_epi64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_blend_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpblendmq
CPUID Flags: AVX512VL + AVX512F

Description

Blend packed 64-bit integers from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := b[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpblendmq
__m512i _mm512_mask_blend_epi64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_blend_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpblendmq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Blend packed 64-bit integers from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := b[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpblendmb
__m128i _mm_mask_blend_epi8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_blend_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpblendmb
CPUID Flags: AVX512VL + AVX512BW

Description

Blend packed 8-bit integers from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := b[i+7:i] ELSE dst[i+7:i] := a[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpblendmb
__m256i _mm256_mask_blend_epi8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_blend_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpblendmb
CPUID Flags: AVX512VL + AVX512BW

Description

Blend packed 8-bit integers from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := b[i+7:i] ELSE dst[i+7:i] := a[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpblendmb
__m512i _mm512_mask_blend_epi8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_blend_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpblendmb
CPUID Flags: AVX512BW

Description

Blend packed 8-bit integers from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := b[i+7:i] ELSE dst[i+7:i] := a[i+7:i] FI ENDFOR dst[MAX:512] := 0
blendpd
__m128d _mm_blend_pd (__m128d a, __m128d b, const int imm8)

Synopsis

__m128d _mm_blend_pd (__m128d a, __m128d b, const int imm8)
#include "smmintrin.h"
Instruction: blendpd xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Blend packed double-precision (64-bit) floating-point elements from a and b using control mask imm8, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 IF imm8[j%8] dst[i+63:i] := b[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.33
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vblendmpd
__m128d _mm_mask_blend_pd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_blend_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vblendmpd
CPUID Flags: AVX512VL + AVX512F

Description

Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := b[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
vblendpd
__m256d _mm256_blend_pd (__m256d a, __m256d b, const int imm8)

Synopsis

__m256d _mm256_blend_pd (__m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vblendpd ymm, ymm, ymm, imm
CPUID Flags: AVX

Description

Blend packed double-precision (64-bit) floating-point elements from a and b using control mask imm8, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 IF imm8[j%8] dst[i+63:i] := b[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.33
Ivy Bridge10.5
Sandy Bridge10.5
vblendmpd
__m256d _mm256_mask_blend_pd (__mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_mask_blend_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vblendmpd
CPUID Flags: AVX512VL + AVX512F

Description

Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := b[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
vblendmpd
__m512d _mm512_mask_blend_pd (__mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_blend_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vblendmpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := b[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
blendps
__m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8)

Synopsis

__m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8)
#include "smmintrin.h"
Instruction: blendps xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Blend packed single-precision (32-bit) floating-point elements from a and b using control mask imm8, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 IF imm8[j%8] dst[i+31:i] := b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.33
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vblendmps
__m128 _mm_mask_blend_ps (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_blend_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vblendmps
CPUID Flags: AVX512VL + AVX512F

Description

Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
vblendps
__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8)

Synopsis

__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vblendps ymm, ymm, ymm, imm
CPUID Flags: AVX

Description

Blend packed single-precision (32-bit) floating-point elements from a and b using control mask imm8, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 IF imm8[j%8] dst[i+31:i] := b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.33
Ivy Bridge10.5
Sandy Bridge10.5
vblendmps
__m256 _mm256_mask_blend_ps (__mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_mask_blend_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vblendmps
CPUID Flags: AVX512VL + AVX512F

Description

Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
vblendmps
__m512 _mm512_mask_blend_ps (__mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_blend_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vblendmps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
pblendvb
__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask)

Synopsis

__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask)
#include "smmintrin.h"
Instruction: pblendvb xmm, xmm
CPUID Flags: SSE4.1

Description

Blend packed 8-bit integers from a and b using mask, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 IF mask[i+7] dst[i+7:i] := b[i+7:i] ELSE dst[i+7:i] := a[i+7:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Ivy Bridge2-
Sandy Bridge2-
Westmere2-
Nehalem2-
vpblendvb
__m256i _mm256_blendv_epi8 (__m256i a, __m256i b, __m256i mask)

Synopsis

__m256i _mm256_blendv_epi8 (__m256i a, __m256i b, __m256i mask)
#include "immintrin.h"
Instruction: vpblendvb ymm, ymm, ymm, ymm
CPUID Flags: AVX2

Description

Blend packed 8-bit integers from a and b using mask, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 IF mask[i+7] dst[i+7:i] := b[i+7:i] ELSE dst[i+7:i] := a[i+7:i] FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell22
blendvpd
__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask)

Synopsis

__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask)
#include "smmintrin.h"
Instruction: blendvpd xmm, xmm
CPUID Flags: SSE4.1

Description

Blend packed double-precision (64-bit) floating-point elements from a and b using mask, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 IF mask[i+63] dst[i+63:i] := b[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell22
Ivy Bridge21
Sandy Bridge21
Westmere22
Nehalem22
vblendvpd
__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask)

Synopsis

__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask)
#include "immintrin.h"
Instruction: vblendvpd ymm, ymm, ymm, ymm
CPUID Flags: AVX

Description

Blend packed double-precision (64-bit) floating-point elements from a and b using mask, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 IF mask[i+63] dst[i+63:i] := b[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell22
Ivy Bridge21
Sandy Bridge21
blendvps
__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask)

Synopsis

__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask)
#include "smmintrin.h"
Instruction: blendvps xmm, xmm
CPUID Flags: SSE4.1

Description

Blend packed single-precision (32-bit) floating-point elements from a and b using mask, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 IF mask[i+31] dst[i+31:i] := b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell22
Ivy Bridge21
Sandy Bridge21
Westmere22
Nehalem22
vblendvps
__m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask)

Synopsis

__m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask)
#include "immintrin.h"
Instruction: vblendvps ymm, ymm, ymm, ymm
CPUID Flags: AVX

Description

Blend packed single-precision (32-bit) floating-point elements from a and b using mask, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 IF mask[i+31] dst[i+31:i] := b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell22
Ivy Bridge21
Sandy Bridge21
blsi
unsigned int _blsi_u32 (unsigned int a)

Synopsis

unsigned int _blsi_u32 (unsigned int a)
#include "immintrin.h"
Instruction: blsi r32, r32
CPUID Flags: BMI1

Description

Extract the lowest set bit from unsigned 32-bit integer a and set the corresponding bit in dst. All other bits in dst are zeroed, and all bits are zeroed if no bits are set in a.

Operation

dst := (-a) BITWISE AND a

Performance

ArchitectureLatencyThroughput
Haswell1-
blsi
unsigned __int64 _blsi_u64 (unsigned __int64 a)

Synopsis

unsigned __int64 _blsi_u64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: blsi r64, r64
CPUID Flags: BMI1

Description

Extract the lowest set bit from unsigned 64-bit integer a and set the corresponding bit in dst. All other bits in dst are zeroed, and all bits are zeroed if no bits are set in a.

Operation

dst := (-a) BITWISE AND a

Performance

ArchitectureLatencyThroughput
Haswell1-
blsmsk
unsigned int _blsmsk_u32 (unsigned int a)

Synopsis

unsigned int _blsmsk_u32 (unsigned int a)
#include "immintrin.h"
Instruction: blsmsk r32, r32
CPUID Flags: BMI1

Description

Set all the lower bits of dst up to and including the lowest set bit in unsigned 32-bit integer a.

Operation

dst := (a - 1) XOR a

Performance

ArchitectureLatencyThroughput
Haswell1-
blsmsk
unsigned __int64 _blsmsk_u64 (unsigned __int64 a)

Synopsis

unsigned __int64 _blsmsk_u64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: blsmsk r64, r64
CPUID Flags: BMI1

Description

Set all the lower bits of dst up to and including the lowest set bit in unsigned 64-bit integer a.

Operation

dst := (a - 1) XOR a

Performance

ArchitectureLatencyThroughput
Haswell1-
blsr
unsigned int _blsr_u32 (unsigned int a)

Synopsis

unsigned int _blsr_u32 (unsigned int a)
#include "immintrin.h"
Instruction: blsr r32, r32
CPUID Flags: BMI1

Description

Copy all bits from unsigned 32-bit integer a to dst, and reset (set to 0) the bit in dst that corresponds to the lowest set bit in a.

Operation

dst := (a - 1) BITWISE AND a

Performance

ArchitectureLatencyThroughput
Haswell1-
blsr
unsigned __int64 _blsr_u64 (unsigned __int64 a)

Synopsis

unsigned __int64 _blsr_u64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: blsr r64, r64
CPUID Flags: BMI1

Description

Copy all bits from unsigned 64-bit integer a to dst, and reset (set to 0) the bit in dst that corresponds to the lowest set bit in a.

Operation

dst := (a - 1) BITWISE AND a

Performance

ArchitectureLatencyThroughput
Haswell1-
bndcu, bndcn
void _bnd_chk_ptr_bounds (const void * q, size_t size)

Synopsis

void _bnd_chk_ptr_bounds (const void * q, size_t size)
#include "immintrin.h"
Instruction: bndcu bnd, m32
             bndcn bnd, m32
CPUID Flags: MPX

Description

Checks if [q, q + size - 1] is within the lower and upper bounds of q and throws a #BR if not.

Operation

IF (q + size - 1) < q.LB OR (q + size - 1) > q.UB THEN #BR; FI;
bndcl
void _bnd_chk_ptr_lbounds (const void * q)

Synopsis

void _bnd_chk_ptr_lbounds (const void * q)
#include "immintrin.h"
Instruction: bndcl bnd, m32
CPUID Flags: MPX

Description

Checks if q is within its lower bound, and throws a #BR if not.

Operation

IF q < q.LB THEN #BR; FI;
bndcu, bndcn
void _bnd_chk_ptr_ubounds (const void * q)

Synopsis

void _bnd_chk_ptr_ubounds (const void * q)
#include "immintrin.h"
Instruction: bndcu bnd, m32
             bndcn bnd, m32
CPUID Flags: MPX

Description

Checks if q is within its upper bound, and throws a #BR if not.

Operation

IF q > q.UB THEN #BR; FI;
...
void * _bnd_copy_ptr_bounds (const void * q, const void * r)

Synopsis

void * _bnd_copy_ptr_bounds (const void * q, const void * r)
#include "immintrin.h"
CPUID Flags: MPX

Description

Make a pointer with the value of q and bounds set to the bounds of r (e.g. copy the bounds of r to pointer q), and store the result in dst.

Operation

dst := q; dst.LB := r.LB; dst.UB := r.UB;
...
const void * _bnd_get_ptr_lbound (const void * q)

Synopsis

const void * _bnd_get_ptr_lbound (const void * q)
#include "immintrin.h"
CPUID Flags: MPX

Description

Return the lower bound of q.

Operation

dst := q.LB
...
const void * _bnd_get_ptr_ubound (const void * q)

Synopsis

const void * _bnd_get_ptr_ubound (const void * q)
#include "immintrin.h"
CPUID Flags: MPX

Description

Return the upper bound of q.

Operation

dst := q.UB
...
void * _bnd_init_ptr_bounds (const void * q)

Synopsis

void * _bnd_init_ptr_bounds (const void * q)
#include "immintrin.h"
CPUID Flags: MPX

Description

Make a pointer with the value of q and open bounds, which allow the pointer to access the entire virtual address space, and store the result in dst.

Operation

dst := q; dst.LB := 0; dst.UB := 0;
...
void * _bnd_narrow_ptr_bounds (const void * q, const void * r, size_t size)

Synopsis

void * _bnd_narrow_ptr_bounds (const void * q, const void * r, size_t size)
#include "immintrin.h"
CPUID Flags: MPX

Description

Narrow the bounds for pointer q to the intersection of the bounds of r and the bounds [q, q + size - 1], and store the result in dst.

Operation

dst := q; IF r.LB > (q + size - 1) OR r.UB < q THEN dst.LB := 1; dst.UB := 0; ELSE dst.LB := MAX(r.LB, q); dst.UB := MIN(r.UB, (q + size - 1)); FI;
bndmk
void * _bnd_set_ptr_bounds (const void * srcmem, size_t size)

Synopsis

void * _bnd_set_ptr_bounds (const void * srcmem, size_t size)
#include "immintrin.h"
Instruction: bndmk bnd, m32
CPUID Flags: MPX

Description

Make a pointer with the value of srcmem and bounds set to [srcmem, srcmem + size - 1], and store the result in dst.

Operation

dst := srcmem; dst.LB := srcmem; dst.UB := srcmem + size - 1;
bndstx
void _bnd_store_ptr_bounds (const void ** ptr_addr, const void * ptr_val)

Synopsis

void _bnd_store_ptr_bounds (const void ** ptr_addr, const void * ptr_val)
#include "immintrin.h"
Instruction: bndstx mib, bnd
CPUID Flags: MPX

Description

Stores the bounds of ptr_val pointer in memory at address ptr_addr.

Operation

MEM[ptr_addr].LB := ptr_val.LB; MEM[ptr_addr].UB := ptr_val.UB;
vbroadcastf32x2
__m256 _mm256_broadcast_f32x2 (__m128 a)

Synopsis

__m256 _mm256_broadcast_f32x2 (__m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*32 n := (j mod 2)*32 dst[i+31:i] := a[n+31:n] ENDFOR dst[MAX:256] := 0
vbroadcastf32x2
__m256 _mm256_mask_broadcast_f32x2 (__m256 src, __mmask8 k, __m128 a)

Synopsis

__m256 _mm256_mask_broadcast_f32x2 (__m256 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 n := (j mod 2)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vbroadcastf32x2
__m256 _mm256_maskz_broadcast_f32x2 (__mmask8 k, __m128 a)

Synopsis

__m256 _mm256_maskz_broadcast_f32x2 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 n := (j mod 2)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vbroadcastf32x2
__m512 _mm512_broadcast_f32x2 (__m128 a)

Synopsis

__m512 _mm512_broadcast_f32x2 (__m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x2
CPUID Flags: AVX512DQ

Description

Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from a to all elements of dst.

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 2)*32 dst[i+31:i] := a[n+31:n] ENDFOR dst[MAX:512] := 0
vbroadcastf32x2
__m512 _mm512_mask_broadcast_f32x2 (__m512 src, __mmask16 k, __m128 a)

Synopsis

__m512 _mm512_mask_broadcast_f32x2 (__m512 src, __mmask16 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x2
CPUID Flags: AVX512DQ

Description

Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 2)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vbroadcastf32x2
__m512 _mm512_maskz_broadcast_f32x2 (__mmask16 k, __m128 a)

Synopsis

__m512 _mm512_maskz_broadcast_f32x2 (__mmask16 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x2
CPUID Flags: AVX512DQ

Description

Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 2)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vbroadcastf32x4
__m256 _mm256_broadcast_f32x4 (__m128 a)

Synopsis

__m256 _mm256_broadcast_f32x4 (__m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x4
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*32 n := (j mod 4)*32 dst[i+31:i] := a[n+31:n] ENDFOR dst[MAX:256] := 0
vbroadcastf32x4
__m256 _mm256_mask_broadcast_f32x4 (__m256 src, __mmask8 k, __m128 a)

Synopsis

__m256 _mm256_mask_broadcast_f32x4 (__m256 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x4
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vbroadcastf32x4
__m256 _mm256_maskz_broadcast_f32x4 (__mmask8 k, __m128 a)

Synopsis

__m256 _mm256_maskz_broadcast_f32x4 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x4
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vbroadcastf32x4
__m512 _mm512_broadcast_f32x4 (__m128 a)

Synopsis

__m512 _mm512_broadcast_f32x4 (__m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x4 zmm {k}, m128
CPUID Flags: AVX512F

Description

Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst.

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 4)*32 dst[i+31:i] := a[n+31:n] ENDFOR dst[MAX:512] := 0
vbroadcastf32x4
__m512 _mm512_mask_broadcast_f32x4 (__m512 src, __mmask16 k, __m128 a)

Synopsis

__m512 _mm512_mask_broadcast_f32x4 (__m512 src, __mmask16 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x4 zmm {k}, m128
CPUID Flags: AVX512F

Description

Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vbroadcastf32x4
__m512 _mm512_maskz_broadcast_f32x4 (__mmask16 k, __m128 a)

Synopsis

__m512 _mm512_maskz_broadcast_f32x4 (__mmask16 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x4 zmm {k}, m128
CPUID Flags: AVX512F

Description

Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vbroadcastf32x8
__m512 _mm512_broadcast_f32x8 (__m256 a)

Synopsis

__m512 _mm512_broadcast_f32x8 (__m256 a)
#include "immintrin.h"
Instruction: vbroadcastf32x8
CPUID Flags: AVX512DQ

Description

Broadcast the 8 packed single-precision (32-bit) floating-point elements from a to all elements of dst.

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 8)*32 dst[i+31:i] := a[n+31:n] ENDFOR dst[MAX:512] := 0
vbroadcastf32x8
__m512 _mm512_mask_broadcast_f32x8 (__m512 src, __mmask16 k, __m256 a)

Synopsis

__m512 _mm512_mask_broadcast_f32x8 (__m512 src, __mmask16 k, __m256 a)
#include "immintrin.h"
Instruction: vbroadcastf32x8
CPUID Flags: AVX512DQ

Description

Broadcast the 8 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 8)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vbroadcastf32x8
__m512 _mm512_maskz_broadcast_f32x8 (__mmask16 k, __m256 a)

Synopsis

__m512 _mm512_maskz_broadcast_f32x8 (__mmask16 k, __m256 a)
#include "immintrin.h"
Instruction: vbroadcastf32x8
CPUID Flags: AVX512DQ

Description

Broadcast the 8 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 8)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vbroadcastf64x2
__m256d _mm256_broadcast_f64x2 (__m128d a)

Synopsis

__m256d _mm256_broadcast_f64x2 (__m128d a)
#include "immintrin.h"
Instruction: vbroadcastf64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Broadcast the 2 packed double-precision (64-bit) floating-point elements from a to all elements of dst.

Operation

FOR j := 0 to 3 i := j*64 n := (j mod 2)*64 dst[i+63:i] := a[n+63:n] ENDFOR dst[MAX:256] := 0
vbroadcastf64x2
__m256d _mm256_mask_broadcast_f64x2 (__m256d src, __mmask8 k, __m128d a)

Synopsis

__m256d _mm256_mask_broadcast_f64x2 (__m256d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastf64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Broadcast the 2 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 n := (j mod 2)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vbroadcastf64x2
__m256d _mm256_maskz_broadcast_f64x2 (__mmask8 k, __m128d a)

Synopsis

__m256d _mm256_maskz_broadcast_f64x2 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastf64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Broadcast the 2 packed double-precision (64-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 n := (j mod 2)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vbroadcastf64x2
__m512d _mm512_broadcast_f64x2 (__m128d a)

Synopsis

__m512d _mm512_broadcast_f64x2 (__m128d a)
#include "immintrin.h"
Instruction: vbroadcastf64x2
CPUID Flags: AVX512DQ

Description

Broadcast the 2 packed double-precision (64-bit) floating-point elements from a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*64 n := (j mod 2)*64 dst[i+63:i] := a[n+63:n] ENDFOR dst[MAX:512] := 0
vbroadcastf64x2
__m512d _mm512_mask_broadcast_f64x2 (__m512d src, __mmask8 k, __m128d a)

Synopsis

__m512d _mm512_mask_broadcast_f64x2 (__m512d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastf64x2
CPUID Flags: AVX512DQ

Description

Broadcast the 2 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 n := (j mod 2)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vbroadcastf64x2
__m512d _mm512_maskz_broadcast_f64x2 (__mmask8 k, __m128d a)

Synopsis

__m512d _mm512_maskz_broadcast_f64x2 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastf64x2
CPUID Flags: AVX512DQ

Description

Broadcast the 2 packed double-precision (64-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 n := (j mod 2)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vbroadcastf64x4
__m512d _mm512_broadcast_f64x4 (__m256d a)

Synopsis

__m512d _mm512_broadcast_f64x4 (__m256d a)
#include "immintrin.h"
Instruction: vbroadcastf64x4 zmm {k}, m256
CPUID Flags: AVX512F

Description

Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*64 n := (j mod 4)*64 dst[i+63:i] := a[n+63:n] ENDFOR dst[MAX:512] := 0
vbroadcastf64x4
__m512d _mm512_mask_broadcast_f64x4 (__m512d src, __mmask8 k, __m256d a)

Synopsis

__m512d _mm512_mask_broadcast_f64x4 (__m512d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vbroadcastf64x4 zmm {k}, m256
CPUID Flags: AVX512F

Description

Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 n := (j mod 4)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vbroadcastf64x4
__m512d _mm512_maskz_broadcast_f64x4 (__mmask8 k, __m256d a)

Synopsis

__m512d _mm512_maskz_broadcast_f64x4 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vbroadcastf64x4 zmm {k}, m256
CPUID Flags: AVX512F

Description

Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 n := (j mod 4)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vbroadcasti32x2
__m128i _mm_broadcast_i32x2 (__m128i a)

Synopsis

__m128i _mm_broadcast_i32x2 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Broadcast the lower 2 packed 32-bit integers from a to all elements of dst.

Operation

FOR j := 0 to 3 i := j*32 n := (j mod 2)*32 dst[i+31:i] := a[n+31:n] ENDFOR dst[MAX:128] := 0
vbroadcasti32x2
__m128i _mm_mask_broadcast_i32x2 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_broadcast_i32x2 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Broadcast the lower 2 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 n := (j mod 2)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vbroadcasti32x2
__m128i _mm_maskz_broadcast_i32x2 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_broadcast_i32x2 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Broadcast the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 n := (j mod 2)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vbroadcasti32x2
__m256i _mm256_broadcast_i32x2 (__m128i a)

Synopsis

__m256i _mm256_broadcast_i32x2 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Broadcast the lower 2 packed 32-bit integers from a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*32 n := (j mod 2)*32 dst[i+31:i] := a[n+31:n] ENDFOR dst[MAX:256] := 0
vbroadcasti32x2
__m256i _mm256_mask_broadcast_i32x2 (__m256i src, __mmask8 k, __m128i a)

Synopsis

__m256i _mm256_mask_broadcast_i32x2 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Broadcast the lower 2 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 n := (j mod 2)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vbroadcasti32x2
__m256i _mm256_maskz_broadcast_i32x2 (__mmask8 k, __m128i a)

Synopsis

__m256i _mm256_maskz_broadcast_i32x2 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Broadcast the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 n := (j mod 2)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vbroadcasti32x2
__m512i _mm512_broadcast_i32x2 (__m128i a)

Synopsis

__m512i _mm512_broadcast_i32x2 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512DQ

Description

Broadcast the lower 2 packed 32-bit integers from a to all elements of dst.

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 2)*32 dst[i+31:i] := a[n+31:n] ENDFOR dst[MAX:512] := 0
vbroadcasti32x2
__m512i _mm512_mask_broadcast_i32x2 (__m512i src, __mmask16 k, __m128i a)

Synopsis

__m512i _mm512_mask_broadcast_i32x2 (__m512i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512DQ

Description

Broadcast the lower 2 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 2)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vbroadcasti32x2
__m512i _mm512_maskz_broadcast_i32x2 (__mmask16 k, __m128i a)

Synopsis

__m512i _mm512_maskz_broadcast_i32x2 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512DQ

Description

Broadcast the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 2)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vbroadcasti32x4
__m256i _mm256_broadcast_i32x4 (__m128i a)

Synopsis

__m256i _mm256_broadcast_i32x4 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x4
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the 4 packed 32-bit integers from a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*32 n := (j mod 4)*32 dst[i+31:i] := a[n+31:n] ENDFOR dst[MAX:256] := 0
vbroadcasti32x4
__m256i _mm256_mask_broadcast_i32x4 (__m256i src, __mmask8 k, __m128i a)

Synopsis

__m256i _mm256_mask_broadcast_i32x4 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x4
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vbroadcasti32x4
__m256i _mm256_maskz_broadcast_i32x4 (__mmask8 k, __m128i a)

Synopsis

__m256i _mm256_maskz_broadcast_i32x4 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x4
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vbroadcasti32x4
__m512i _mm512_broadcast_i32x4 (__m128i a)

Synopsis

__m512i _mm512_broadcast_i32x4 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x4 zmm {k}, m128
CPUID Flags: AVX512F

Description

Broadcast the 4 packed 32-bit integers from a to all elements of dst.

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 4)*32 dst[i+31:i] := a[n+31:n] ENDFOR dst[MAX:512] := 0
vbroadcasti32x4
__m512i _mm512_mask_broadcast_i32x4 (__m512i src, __mmask16 k, __m128i a)

Synopsis

__m512i _mm512_mask_broadcast_i32x4 (__m512i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x4 zmm {k}, m128
CPUID Flags: AVX512F

Description

Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vbroadcasti32x4
__m512i _mm512_maskz_broadcast_i32x4 (__mmask16 k, __m128i a)

Synopsis

__m512i _mm512_maskz_broadcast_i32x4 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x4 zmm {k}, m128
CPUID Flags: AVX512F

Description

Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vbroadcasti32x8
__m512i _mm512_broadcast_i32x8 (__m256i a)

Synopsis

__m512i _mm512_broadcast_i32x8 (__m256i a)
#include "immintrin.h"
Instruction: vbroadcasti32x8
CPUID Flags: AVX512DQ

Description

Broadcast the 8 packed 32-bit integers from a to all elements of dst.

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 8)*32 dst[i+31:i] := a[n+31:n] ENDFOR dst[MAX:512] := 0
vbroadcasti32x8
__m512i _mm512_mask_broadcast_i32x8 (__m512i src, __mmask16 k, __m256i a)

Synopsis

__m512i _mm512_mask_broadcast_i32x8 (__m512i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vbroadcasti32x8
CPUID Flags: AVX512DQ

Description

Broadcast the 8 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 8)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vbroadcasti32x8
__m512i _mm512_maskz_broadcast_i32x8 (__mmask16 k, __m256i a)

Synopsis

__m512i _mm512_maskz_broadcast_i32x8 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vbroadcasti32x8
CPUID Flags: AVX512DQ

Description

Broadcast the 8 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 n := (j mod 8)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vbroadcasti64x2
__m256i _mm256_broadcast_i64x2 (__m128i a)

Synopsis

__m256i _mm256_broadcast_i64x2 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Broadcast the 2 packed 64-bit integers from a to all elements of dst.

Operation

FOR j := 0 to 3 i := j*64 n := (j mod 2)*64 dst[i+63:i] := a[n+63:n] ENDFOR dst[MAX:256] := 0
vbroadcasti64x2
__m256i _mm256_mask_broadcast_i64x2 (__m256i src, __mmask8 k, __m128i a)

Synopsis

__m256i _mm256_mask_broadcast_i64x2 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Broadcast the 2 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 n := (j mod 2)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vbroadcasti64x2
__m256i _mm256_maskz_broadcast_i64x2 (__mmask8 k, __m128i a)

Synopsis

__m256i _mm256_maskz_broadcast_i64x2 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Broadcast the 2 packed 64-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 n := (j mod 2)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vbroadcasti64x2
__m512i _mm512_broadcast_i64x2 (__m128i a)

Synopsis

__m512i _mm512_broadcast_i64x2 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti64x2
CPUID Flags: AVX512DQ

Description

Broadcast the 2 packed 64-bit integers from a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*64 n := (j mod 2)*64 dst[i+63:i] := a[n+63:n] ENDFOR dst[MAX:512] := 0
vbroadcasti64x2
__m512i _mm512_mask_broadcast_i64x2 (__m512i src, __mmask8 k, __m128i a)

Synopsis

__m512i _mm512_mask_broadcast_i64x2 (__m512i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti64x2
CPUID Flags: AVX512DQ

Description

Broadcast the 2 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 n := (j mod 2)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vbroadcasti64x2
__m512i _mm512_maskz_broadcast_i64x2 (__mmask8 k, __m128i a)

Synopsis

__m512i _mm512_maskz_broadcast_i64x2 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti64x2
CPUID Flags: AVX512DQ

Description

Broadcast the 2 packed 64-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 n := (j mod 2)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vbroadcasti64x4
__m512i _mm512_broadcast_i64x4 (__m256i a)

Synopsis

__m512i _mm512_broadcast_i64x4 (__m256i a)
#include "immintrin.h"
Instruction: vbroadcasti64x4 zmm {k}, m256
CPUID Flags: AVX512F

Description

Broadcast the 4 packed 64-bit integers from a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*64 n := (j mod 4)*64 dst[i+63:i] := a[n+63:n] ENDFOR dst[MAX:512] := 0
vbroadcasti64x4
__m512i _mm512_mask_broadcast_i64x4 (__m512i src, __mmask8 k, __m256i a)

Synopsis

__m512i _mm512_mask_broadcast_i64x4 (__m512i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vbroadcasti64x4 zmm {k}, m256
CPUID Flags: AVX512F

Description

Broadcast the 4 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 n := (j mod 4)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vbroadcasti64x4
__m512i _mm512_maskz_broadcast_i64x4 (__mmask8 k, __m256i a)

Synopsis

__m512i _mm512_maskz_broadcast_i64x4 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vbroadcasti64x4 zmm {k}, m256
CPUID Flags: AVX512F

Description

Broadcast the 4 packed 64-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 n := (j mod 4)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vbroadcastf128
__m256d _mm256_broadcast_pd (__m128d const * mem_addr)

Synopsis

__m256d _mm256_broadcast_pd (__m128d const * mem_addr)
#include "immintrin.h"
Instruction: vbroadcastf128 ymm, m128
CPUID Flags: AVX

Description

Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of dst.

Operation

tmp[127:0] := MEM[mem_addr+127:mem_addr] dst[127:0] := tmp[127:0] dst[255:128] := tmp[127:0] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Ivy Bridge1-
Sandy Bridge1-
vbroadcastf128
__m256 _mm256_broadcast_ps (__m128 const * mem_addr)

Synopsis

__m256 _mm256_broadcast_ps (__m128 const * mem_addr)
#include "immintrin.h"
Instruction: vbroadcastf128 ymm, m128
CPUID Flags: AVX

Description

Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of dst.

Operation

tmp[127:0] := MEM[mem_addr+127:mem_addr] dst[127:0] := tmp[127:0] dst[255:128] := tmp[127:0] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Ivy Bridge1-
Sandy Bridge1-
vbroadcastsd
__m256d _mm256_broadcast_sd (double const * mem_addr)

Synopsis

__m256d _mm256_broadcast_sd (double const * mem_addr)
#include "immintrin.h"
Instruction: vbroadcastsd ymm, m64
CPUID Flags: AVX

Description

Broadcast a double-precision (64-bit) floating-point element from memory to all elements of dst.

Operation

tmp[63:0] := MEM[mem_addr+63:mem_addr] FOR j := 0 to 3 i := j*64 dst[i+63:i] := tmp[63:0] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Ivy Bridge1-
Sandy Bridge1-
vbroadcastss
__m128 _mm_broadcast_ss (float const * mem_addr)

Synopsis

__m128 _mm_broadcast_ss (float const * mem_addr)
#include "immintrin.h"
Instruction: vbroadcastss xmm, m32
CPUID Flags: AVX

Description

Broadcast a single-precision (32-bit) floating-point element from memory to all elements of dst.

Operation

tmp[31:0] := MEM[mem_addr+31:mem_addr] FOR j := 0 to 3 i := j*32 dst[i+31:i] := tmp[31:0] ENDFOR dst[MAX:128] := 0
vbroadcastss
__m256 _mm256_broadcast_ss (float const * mem_addr)

Synopsis

__m256 _mm256_broadcast_ss (float const * mem_addr)
#include "immintrin.h"
Instruction: vbroadcastss ymm, m32
CPUID Flags: AVX

Description

Broadcast a single-precision (32-bit) floating-point element from memory to all elements of dst.

Operation

tmp[31:0] := MEM[mem_addr+31:mem_addr] FOR j := 0 to 7 i := j*32 dst[i+31:i] := tmp[31:0] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Ivy Bridge1-
Sandy Bridge1-
vpbroadcastb
__m128i _mm_broadcastb_epi8 (__m128i a)

Synopsis

__m128i _mm_broadcastb_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb xmm, xmm
CPUID Flags: AVX2

Description

Broadcast the low packed 8-bit integer from a to all elements of dst.

Operation

FOR j := 0 to 15 i := j*8 dst[i+7:i] := a[7:0] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpbroadcastb
__m128i _mm_mask_broadcastb_epi8 (__m128i src, __mmask16 k, __m128i a)

Synopsis

__m128i _mm_mask_broadcastb_epi8 (__m128i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpbroadcastb
__m128i _mm_maskz_broadcastb_epi8 (__mmask16 k, __m128i a)

Synopsis

__m128i _mm_maskz_broadcastb_epi8 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpbroadcastb
__m256i _mm256_broadcastb_epi8 (__m128i a)

Synopsis

__m256i _mm256_broadcastb_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb ymm, xmm
CPUID Flags: AVX2

Description

Broadcast the low packed 8-bit integer from a to all elements of dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := a[7:0] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpbroadcastb
__m256i _mm256_mask_broadcastb_epi8 (__m256i src, __mmask32 k, __m128i a)

Synopsis

__m256i _mm256_mask_broadcastb_epi8 (__m256i src, __mmask32 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpbroadcastb
__m256i _mm256_maskz_broadcastb_epi8 (__mmask32 k, __m128i a)

Synopsis

__m256i _mm256_maskz_broadcastb_epi8 (__mmask32 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpbroadcastb
__m512i _mm512_broadcastb_epi8 (__m128i a)

Synopsis

__m512i _mm512_broadcastb_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512BW

Description

Broadcast the low packed 8-bit integer from a to all elements of dst.

Operation

FOR j := 0 to 63 i := j*8 dst[i+7:i] := a[7:0] ENDFOR dst[MAX:512] := 0
vpbroadcastb
__m512i _mm512_mask_broadcastb_epi8 (__m512i src, __mmask64 k, __m128i a)

Synopsis

__m512i _mm512_mask_broadcastb_epi8 (__m512i src, __mmask64 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512BW

Description

Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpbroadcastb
__m512i _mm512_maskz_broadcastb_epi8 (__mmask64 k, __m128i a)

Synopsis

__m512i _mm512_maskz_broadcastb_epi8 (__mmask64 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512BW

Description

Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpbroadcastd
__m128i _mm_broadcastd_epi32 (__m128i a)

Synopsis

__m128i _mm_broadcastd_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd xmm, xmm
CPUID Flags: AVX2

Description

Broadcast the low packed 32-bit integer from a to all elements of dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := a[31:0] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpbroadcastd
__m128i _mm_mask_broadcastd_epi32 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_broadcastd_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpbroadcastd
__m128i _mm_maskz_broadcastd_epi32 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_broadcastd_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpbroadcastd
__m256i _mm256_broadcastd_epi32 (__m128i a)

Synopsis

__m256i _mm256_broadcastd_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd ymm, xmm
CPUID Flags: AVX2

Description

Broadcast the low packed 32-bit integer from a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := a[31:0] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpbroadcastd
__m256i _mm256_mask_broadcastd_epi32 (__m256i src, __mmask8 k, __m128i a)

Synopsis

__m256i _mm256_mask_broadcastd_epi32 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpbroadcastd
__m256i _mm256_maskz_broadcastd_epi32 (__mmask8 k, __m128i a)

Synopsis

__m256i _mm256_maskz_broadcastd_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpbroadcastd
__m512i _mm512_broadcastd_epi32 (__m128i a)

Synopsis

__m512i _mm512_broadcastd_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd zmm {k}, xmm
CPUID Flags: AVX512F

Description

Broadcast the low packed 32-bit integer from a to all elements of dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[31:0] ENDFOR dst[MAX:512] := 0
vpbroadcastd
__m512i _mm512_mask_broadcastd_epi32 (__m512i src, __mmask16 k, __m128i a)

Synopsis

__m512i _mm512_mask_broadcastd_epi32 (__m512i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd zmm {k}, xmm
CPUID Flags: AVX512F

Description

Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpbroadcastd
__m512i _mm512_maskz_broadcastd_epi32 (__mmask16 k, __m128i a)

Synopsis

__m512i _mm512_maskz_broadcastd_epi32 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd zmm {k}, xmm
CPUID Flags: AVX512F

Description

Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpbroadcastmb2q
__m128i _mm_broadcastmb_epi64 (__mmask8 k)

Synopsis

__m128i _mm_broadcastmb_epi64 (__mmask8 k)
#include "immintrin.h"
Instruction: vpbroadcastmb2q
CPUID Flags: AVX512VL + AVX512CD

Description

Broadcast the low 8-bits from input mask k to all 64-bit elements of dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ZeroExtend(k[7:0]) ENDFOR dst[MAX:128] := 0
vpbroadcastmb2q
__m256i _mm256_broadcastmb_epi64 (__mmask8 k)

Synopsis

__m256i _mm256_broadcastmb_epi64 (__mmask8 k)
#include "immintrin.h"
Instruction: vpbroadcastmb2q
CPUID Flags: AVX512VL + AVX512CD

Description

Broadcast the low 8-bits from input mask k to all 64-bit elements of dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ZeroExtend(k[7:0]) ENDFOR dst[MAX:256] := 0
vpbroadcastmb2q
__m512i _mm512_broadcastmb_epi64 (__mmask8 k)

Synopsis

__m512i _mm512_broadcastmb_epi64 (__mmask8 k)
#include "immintrin.h"
Instruction: vpbroadcastmb2q zmm, k
CPUID Flags: AVX512CD

Description

Broadcast the low 8-bits from input mask k to all 64-bit elements of dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ZeroExtend(k[7:0]) ENDFOR dst[MAX:512] := 0
vpbroadcastmw2d
__m128i _mm_broadcastmw_epi32 (__mmask16 k)

Synopsis

__m128i _mm_broadcastmw_epi32 (__mmask16 k)
#include "immintrin.h"
Instruction: vpbroadcastmw2d
CPUID Flags: AVX512VL + AVX512CD

Description

Broadcast the low 16-bits from input mask k to all 32-bit elements of dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ZeroExtend(k[15:0]) ENDFOR dst[MAX:128] := 0
vpbroadcastmw2d
__m256i _mm256_broadcastmw_epi32 (__mmask16 k)

Synopsis

__m256i _mm256_broadcastmw_epi32 (__mmask16 k)
#include "immintrin.h"
Instruction: vpbroadcastmw2d
CPUID Flags: AVX512VL + AVX512CD

Description

Broadcast the low 16-bits from input mask k to all 32-bit elements of dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ZeroExtend(k[15:0]) ENDFOR dst[MAX:256] := 0
vpbroadcastmw2d
__m512i _mm512_broadcastmw_epi32 (__mmask16 k)

Synopsis

__m512i _mm512_broadcastmw_epi32 (__mmask16 k)
#include "immintrin.h"
Instruction: vpbroadcastmw2d zmm, k
CPUID Flags: AVX512CD

Description

Broadcast the low 16-bits from input mask k to all 32-bit elements of dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ZeroExtend(k[15:0]) ENDFOR dst[MAX:512] := 0
vpbroadcastq
__m128i _mm_broadcastq_epi64 (__m128i a)

Synopsis

__m128i _mm_broadcastq_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq xmm, xmm
CPUID Flags: AVX2

Description

Broadcast the low packed 64-bit integer from a to all elements of dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := a[63:0] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpbroadcastq
__m128i _mm_mask_broadcastq_epi64 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_broadcastq_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpbroadcastq
__m128i _mm_maskz_broadcastq_epi64 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_broadcastq_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpbroadcastq
__m256i _mm256_broadcastq_epi64 (__m128i a)

Synopsis

__m256i _mm256_broadcastq_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq ymm, xmm
CPUID Flags: AVX2

Description

Broadcast the low packed 64-bit integer from a to all elements of dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[63:0] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpbroadcastq
__m256i _mm256_mask_broadcastq_epi64 (__m256i src, __mmask8 k, __m128i a)

Synopsis

__m256i _mm256_mask_broadcastq_epi64 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpbroadcastq
__m256i _mm256_maskz_broadcastq_epi64 (__mmask8 k, __m128i a)

Synopsis

__m256i _mm256_maskz_broadcastq_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpbroadcastq
__m512i _mm512_broadcastq_epi64 (__m128i a)

Synopsis

__m512i _mm512_broadcastq_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq zmm {k}, xmm
CPUID Flags: AVX512F

Description

Broadcast the low packed 64-bit integer from a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[63:0] ENDFOR dst[MAX:512] := 0
vpbroadcastq
__m512i _mm512_mask_broadcastq_epi64 (__m512i src, __mmask8 k, __m128i a)

Synopsis

__m512i _mm512_mask_broadcastq_epi64 (__m512i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq zmm {k}, xmm
CPUID Flags: AVX512F

Description

Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpbroadcastq
__m512i _mm512_maskz_broadcastq_epi64 (__mmask8 k, __m128i a)

Synopsis

__m512i _mm512_maskz_broadcastq_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq zmm {k}, xmm
CPUID Flags: AVX512F

Description

Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
movddup
__m128d _mm_broadcastsd_pd (__m128d a)

Synopsis

__m128d _mm_broadcastsd_pd (__m128d a)
#include "immintrin.h"
Instruction: movddup xmm, xmm
CPUID Flags: AVX2

Description

Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := a[63:0] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge1-
Westmere1-
Nehalem1-
vbroadcastsd
__m256d _mm256_broadcastsd_pd (__m128d a)

Synopsis

__m256d _mm256_broadcastsd_pd (__m128d a)
#include "immintrin.h"
Instruction: vbroadcastsd ymm, xmm
CPUID Flags: AVX2

Description

Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[63:0] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vbroadcastsd
__m256d _mm256_mask_broadcastsd_pd (__m256d src, __mmask8 k, __m128d a)

Synopsis

__m256d _mm256_mask_broadcastsd_pd (__m256d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastsd
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vbroadcastsd
__m256d _mm256_maskz_broadcastsd_pd (__mmask8 k, __m128d a)

Synopsis

__m256d _mm256_maskz_broadcastsd_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastsd
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vbroadcastsd
__m512d _mm512_broadcastsd_pd (__m128d a)

Synopsis

__m512d _mm512_broadcastsd_pd (__m128d a)
#include "immintrin.h"
Instruction: vbroadcastsd zmm {k}, xmm
CPUID Flags: AVX512F

Description

Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[63:0] ENDFOR dst[MAX:512] := 0
vbroadcastsd
__m512d _mm512_mask_broadcastsd_pd (__m512d src, __mmask8 k, __m128d a)

Synopsis

__m512d _mm512_mask_broadcastsd_pd (__m512d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastsd zmm {k}, xmm
CPUID Flags: AVX512F

Description

Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vbroadcastsd
__m512d _mm512_maskz_broadcastsd_pd (__mmask8 k, __m128d a)

Synopsis

__m512d _mm512_maskz_broadcastsd_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastsd zmm {k}, xmm
CPUID Flags: AVX512F

Description

Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vbroadcasti128
__m256i _mm256_broadcastsi128_si256 (__m128i a)

Synopsis

__m256i _mm256_broadcastsi128_si256 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti128 ymm, m128
CPUID Flags: AVX2

Description

Broadcast 128 bits of integer data from a to all 128-bit lanes in dst.

Operation

dst[127:0] := a[127:0] dst[255:128] := a[127:0] dst[MAX:256] := 0
vbroadcastss
__m128 _mm_broadcastss_ps (__m128 a)

Synopsis

__m128 _mm_broadcastss_ps (__m128 a)
#include "immintrin.h"
Instruction: vbroadcastss xmm, xmm
CPUID Flags: AVX2

Description

Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := a[31:0] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vbroadcastss
__m128 _mm_mask_broadcastss_ps (__m128 src, __mmask8 k, __m128 a)

Synopsis

__m128 _mm_mask_broadcastss_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastss
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vbroadcastss
__m128 _mm_maskz_broadcastss_ps (__mmask8 k, __m128 a)

Synopsis

__m128 _mm_maskz_broadcastss_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastss
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vbroadcastss
__m256 _mm256_broadcastss_ps (__m128 a)

Synopsis

__m256 _mm256_broadcastss_ps (__m128 a)
#include "immintrin.h"
Instruction: vbroadcastss ymm, xmm
CPUID Flags: AVX2

Description

Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := a[31:0] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vbroadcastss
__m256 _mm256_mask_broadcastss_ps (__m256 src, __mmask8 k, __m128 a)

Synopsis

__m256 _mm256_mask_broadcastss_ps (__m256 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastss
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vbroadcastss
__m256 _mm256_maskz_broadcastss_ps (__mmask8 k, __m128 a)

Synopsis

__m256 _mm256_maskz_broadcastss_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastss
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vbroadcastss
__m512 _mm512_broadcastss_ps (__m128 a)

Synopsis

__m512 _mm512_broadcastss_ps (__m128 a)
#include "immintrin.h"
Instruction: vbroadcastss zmm {k}, xmm
CPUID Flags: AVX512F

Description

Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[31:0] ENDFOR dst[MAX:512] := 0
vbroadcastss
__m512 _mm512_mask_broadcastss_ps (__m512 src, __mmask16 k, __m128 a)

Synopsis

__m512 _mm512_mask_broadcastss_ps (__m512 src, __mmask16 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastss zmm {k}, xmm
CPUID Flags: AVX512F

Description

Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vbroadcastss
__m512 _mm512_maskz_broadcastss_ps (__mmask16 k, __m128 a)

Synopsis

__m512 _mm512_maskz_broadcastss_ps (__mmask16 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastss zmm {k}, xmm
CPUID Flags: AVX512F

Description

Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpbroadcastw
__m128i _mm_broadcastw_epi16 (__m128i a)

Synopsis

__m128i _mm_broadcastw_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw xmm, xmm
CPUID Flags: AVX2

Description

Broadcast the low packed 16-bit integer from a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := a[15:0] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpbroadcastw
__m128i _mm_mask_broadcastw_epi16 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_broadcastw_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpbroadcastw
__m128i _mm_maskz_broadcastw_epi16 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_broadcastw_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpbroadcastw
__m256i _mm256_broadcastw_epi16 (__m128i a)

Synopsis

__m256i _mm256_broadcastw_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw ymm, xmm
CPUID Flags: AVX2

Description

Broadcast the low packed 16-bit integer from a to all elements of dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := a[15:0] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpbroadcastw
__m256i _mm256_mask_broadcastw_epi16 (__m256i src, __mmask16 k, __m128i a)

Synopsis

__m256i _mm256_mask_broadcastw_epi16 (__m256i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpbroadcastw
__m256i _mm256_maskz_broadcastw_epi16 (__mmask16 k, __m128i a)

Synopsis

__m256i _mm256_maskz_broadcastw_epi16 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpbroadcastw
__m512i _mm512_broadcastw_epi16 (__m128i a)

Synopsis

__m512i _mm512_broadcastw_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512BW

Description

Broadcast the low packed 16-bit integer from a to all elements of dst.

Operation

FOR j := 0 to 31 i := j*16 dst[i+15:i] := a[15:0] ENDFOR dst[MAX:512] := 0
vpbroadcastw
__m512i _mm512_mask_broadcastw_epi16 (__m512i src, __mmask32 k, __m128i a)

Synopsis

__m512i _mm512_mask_broadcastw_epi16 (__m512i src, __mmask32 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512BW

Description

Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpbroadcastw
__m512i _mm512_maskz_broadcastw_epi16 (__mmask32 k, __m128i a)

Synopsis

__m512i _mm512_maskz_broadcastw_epi16 (__mmask32 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512BW

Description

Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpslldq
__m256i _mm256_bslli_epi128 (__m256i a, const int imm8)

Synopsis

__m256i _mm256_bslli_epi128 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpslldq ymm, ymm, imm
CPUID Flags: AVX2

Description

Shift 128-bit lanes in a left by imm8 bytes while shifting in zeros, and store the results in dst.

Operation

tmp := imm8[7:0] IF tmp > 15 tmp := 16 FI dst[127:0] := a[127:0] << (tmp*8) dst[255:128] := a[255:128] << (tmp*8) dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 1       | -
vpslldq
__m512i _mm512_bslli_epi128 (__m512i a, int imm8)

Synopsis

__m512i _mm512_bslli_epi128 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vpslldq
CPUID Flags: AVX512BW

Description

Shift 128-bit lanes in a left by imm8 bytes while shifting in zeros, and store the results in dst.

Operation

tmp := imm8[7:0] IF tmp > 15 tmp := 16 FI dst[127:0] := a[127:0] << (tmp*8) dst[255:128] := a[255:128] << (tmp*8) dst[383:256] := a[383:256] << (tmp*8) dst[511:384] := a[511:384] << (tmp*8) dst[MAX:512] := 0
pslldq
__m128i _mm_bslli_si128 (__m128i a, int imm8)

Synopsis

__m128i _mm_bslli_si128 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: pslldq xmm, imm
CPUID Flags: SSE2

Description

Shift a left by imm8 bytes while shifting in zeros, and store the results in dst.

Operation

tmp := imm8[7:0] IF tmp > 15 tmp := 16 FI dst[127:0] := a[127:0] << (tmp*8)

Performance

Architecture | Latency | Throughput
Haswell      | 1       | 0.5
Ivy Bridge   | 1       | 0.5
Sandy Bridge | 1       | 0.5
Westmere     | 1       | 0.5
Nehalem      | 1       | 0.5
vpsrldq
__m256i _mm256_bsrli_epi128 (__m256i a, const int imm8)

Synopsis

__m256i _mm256_bsrli_epi128 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpsrldq ymm, ymm, imm
CPUID Flags: AVX2

Description

Shift 128-bit lanes in a right by imm8 bytes while shifting in zeros, and store the results in dst.

Operation

tmp := imm8[7:0] IF tmp > 15 tmp := 16 FI dst[127:0] := a[127:0] >> (tmp*8) dst[255:128] := a[255:128] >> (tmp*8) dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 1       | -
vpsrldq
__m512i _mm512_bsrli_epi128 (__m512i a, int imm8)

Synopsis

__m512i _mm512_bsrli_epi128 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vpsrldq
CPUID Flags: AVX512BW

Description

Shift 128-bit lanes in a right by imm8 bytes while shifting in zeros, and store the results in dst.

Operation

tmp := imm8[7:0] IF tmp > 15 tmp := 16 FI dst[127:0] := a[127:0] >> (tmp*8) dst[255:128] := a[255:128] >> (tmp*8) dst[383:256] := a[383:256] >> (tmp*8) dst[511:384] := a[511:384] >> (tmp*8) dst[MAX:512] := 0
psrldq
__m128i _mm_bsrli_si128 (__m128i a, int imm8)

Synopsis

__m128i _mm_bsrli_si128 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psrldq xmm, imm
CPUID Flags: SSE2

Description

Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.

Operation

tmp := imm8[7:0] IF tmp > 15 tmp := 16 FI dst[127:0] := a[127:0] >> (tmp*8)

Performance

Architecture | Latency | Throughput
Haswell      | 1       | 0.5
Ivy Bridge   | 1       | 0.5
Sandy Bridge | 1       | 0.5
Westmere     | 1       | 0.5
Nehalem      | 1       | 0.5
bswap
int _bswap (int a)

Synopsis

int _bswap (int a)
#include "immintrin.h"
Instruction: bswap r32

Description

Reverse the byte order of 32-bit integer a, and store the result in dst. This intrinsic is provided for conversion between little and big endian values.

Operation

dst[7:0] := a[31:24] dst[15:8] := a[23:16] dst[23:16] := a[15:8] dst[31:24] := a[7:0]

Performance

Architecture | Latency | Throughput
Haswell      | 2       | -
Ivy Bridge   | 2       | -
Sandy Bridge | 2       | -
Westmere     | 3       | -
Nehalem      | 3       | -
bswap
__int64 _bswap64 (__int64 a)

Synopsis

__int64 _bswap64 (__int64 a)
#include "immintrin.h"
Instruction: bswap r64

Description

Reverse the byte order of 64-bit integer a, and store the result in dst. This intrinsic is provided for conversion between little and big endian values.

Operation

dst[7:0] := a[63:56] dst[15:8] := a[55:48] dst[23:16] := a[47:40] dst[31:24] := a[39:32] dst[39:32] := a[31:24] dst[47:40] := a[23:16] dst[55:48] := a[15:8] dst[63:56] := a[7:0]

Performance

Architecture | Latency | Throughput
Haswell      | 2       | -
Ivy Bridge   | 2       | -
Sandy Bridge | 2       | -
Westmere     | 3       | -
Nehalem      | 3       | -
bzhi
unsigned int _bzhi_u32 (unsigned int a, unsigned int index)

Synopsis

unsigned int _bzhi_u32 (unsigned int a, unsigned int index)
#include "immintrin.h"
Instruction: bzhi r32, r32, r32
CPUID Flags: BMI2

Description

Copy all bits from unsigned 32-bit integer a to dst, and reset (set to 0) the high bits in dst starting at index.

Operation

n := index[7:0] dst := a IF (n < 32) dst[31:n] := 0 FI

Performance

Architecture | Latency | Throughput
Haswell      | 1       | -
bzhi
unsigned __int64 _bzhi_u64 (unsigned __int64 a, unsigned int index)

Synopsis

unsigned __int64 _bzhi_u64 (unsigned __int64 a, unsigned int index)
#include "immintrin.h"
Instruction: bzhi r64, r64, r64
CPUID Flags: BMI2

Description

Copy all bits from unsigned 64-bit integer a to dst, and reset (set to 0) the high bits in dst starting at index.

Operation

n := index[7:0] dst := a IF (n < 64) dst[63:n] := 0 FI

Performance

Architecture | Latency | Throughput
Haswell      | 1       | -
unsigned __int32 _castf32_u32 (float a)

Synopsis

unsigned __int32 _castf32_u32 (float a)
#include "immintrin.h"

Description

Cast from type float to type unsigned __int32 without conversion. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
unsigned __int64 _castf64_u64 (double a)

Synopsis

unsigned __int64 _castf64_u64 (double a)
#include "immintrin.h"

Description

Cast from type double to type unsigned __int64 without conversion. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128 _mm_castpd_ps (__m128d a)

Synopsis

__m128 _mm_castpd_ps (__m128d a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Cast vector of type __m128d to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256 _mm256_castpd_ps (__m256d a)

Synopsis

__m256 _mm256_castpd_ps (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Cast vector of type __m256d to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512 _mm512_castpd_ps (__m512d a)

Synopsis

__m512 _mm512_castpd_ps (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Cast vector of type __m512d to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128i _mm_castpd_si128 (__m128d a)

Synopsis

__m128i _mm_castpd_si128 (__m128d a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Cast vector of type __m128d to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256i _mm256_castpd_si256 (__m256d a)

Synopsis

__m256i _mm256_castpd_si256 (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Casts vector of type __m256d to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512i _mm512_castpd_si512 (__m512d a)

Synopsis

__m512i _mm512_castpd_si512 (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Cast vector of type __m512d to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256d _mm256_castpd128_pd256 (__m128d a)

Synopsis

__m256d _mm256_castpd128_pd256 (__m128d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Casts vector of type __m128d to type __m256d; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512d _mm512_castpd128_pd512 (__m128d a)

Synopsis

__m512d _mm512_castpd128_pd512 (__m128d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128d _mm256_castpd256_pd128 (__m256d a)

Synopsis

__m128d _mm256_castpd256_pd128 (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Casts vector of type __m256d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512d _mm512_castpd256_pd512 (__m256d a)

Synopsis

__m512d _mm512_castpd256_pd512 (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128d _mm512_castpd512_pd128 (__m512d a)

Synopsis

__m128d _mm512_castpd512_pd128 (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Cast vector of type __m512d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256d _mm512_castpd512_pd256 (__m512d a)

Synopsis

__m256d _mm512_castpd512_pd256 (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Cast vector of type __m512d to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128d _mm_castps_pd (__m128 a)

Synopsis

__m128d _mm_castps_pd (__m128 a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Cast vector of type __m128 to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256d _mm256_castps_pd (__m256 a)

Synopsis

__m256d _mm256_castps_pd (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Cast vector of type __m256 to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512d _mm512_castps_pd (__m512 a)

Synopsis

__m512d _mm512_castps_pd (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Cast vector of type __m512 to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128i _mm_castps_si128 (__m128 a)

Synopsis

__m128i _mm_castps_si128 (__m128 a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Cast vector of type __m128 to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256i _mm256_castps_si256 (__m256 a)

Synopsis

__m256i _mm256_castps_si256 (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Casts vector of type __m256 to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512i _mm512_castps_si512 (__m512 a)

Synopsis

__m512i _mm512_castps_si512 (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Cast vector of type __m512 to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256 _mm256_castps128_ps256 (__m128 a)

Synopsis

__m256 _mm256_castps128_ps256 (__m128 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Casts vector of type __m128 to type __m256; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512 _mm512_castps128_ps512 (__m128 a)

Synopsis

__m512 _mm512_castps128_ps512 (__m128 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128 _mm256_castps256_ps128 (__m256 a)

Synopsis

__m128 _mm256_castps256_ps128 (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Casts vector of type __m256 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512 _mm512_castps256_ps512 (__m256 a)

Synopsis

__m512 _mm512_castps256_ps512 (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128 _mm512_castps512_ps128 (__m512 a)

Synopsis

__m128 _mm512_castps512_ps128 (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Cast vector of type __m512 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256 _mm512_castps512_ps256 (__m512 a)

Synopsis

__m256 _mm512_castps512_ps256 (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Cast vector of type __m512 to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128d _mm_castsi128_pd (__m128i a)

Synopsis

__m128d _mm_castsi128_pd (__m128i a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Cast vector of type __m128i to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128 _mm_castsi128_ps (__m128i a)

Synopsis

__m128 _mm_castsi128_ps (__m128i a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Cast vector of type __m128i to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256i _mm256_castsi128_si256 (__m128i a)

Synopsis

__m256i _mm256_castsi128_si256 (__m128i a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Casts vector of type __m128i to type __m256i; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512i _mm512_castsi128_si512 (__m128i a)

Synopsis

__m512i _mm512_castsi128_si512 (__m128i a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256d _mm256_castsi256_pd (__m256i a)

Synopsis

__m256d _mm256_castsi256_pd (__m256i a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Casts vector of type __m256i to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256 _mm256_castsi256_ps (__m256i a)

Synopsis

__m256 _mm256_castsi256_ps (__m256i a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Casts vector of type __m256i to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128i _mm256_castsi256_si128 (__m256i a)

Synopsis

__m128i _mm256_castsi256_si128 (__m256i a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Casts vector of type __m256i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512i _mm512_castsi256_si512 (__m256i a)

Synopsis

__m512i _mm512_castsi256_si512 (__m256i a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512d _mm512_castsi512_pd (__m512i a)

Synopsis

__m512d _mm512_castsi512_pd (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Cast vector of type __m512i to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512 _mm512_castsi512_ps (__m512i a)

Synopsis

__m512 _mm512_castsi512_ps (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Cast vector of type __m512i to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128i _mm512_castsi512_si128 (__m512i a)

Synopsis

__m128i _mm512_castsi512_si128 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Cast vector of type __m512i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256i _mm512_castsi512_si256 (__m512i a)

Synopsis

__m256i _mm512_castsi512_si256 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Cast vector of type __m512i to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
float _castu32_f32 (unsigned __int32 a)

Synopsis

float _castu32_f32 (unsigned __int32 a)
#include "immintrin.h"

Description

Cast from type unsigned __int32 to type float without conversion. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
double _castu64_f64 (unsigned __int64 a)

Synopsis

double _castu64_f64 (unsigned __int64 a)
#include "immintrin.h"

Description

Cast from type unsigned __int64 to type double without conversion. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
...
__m128d _mm_cbrt_pd (__m128d a)

Synopsis

__m128d _mm_cbrt_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the cube root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := CubeRoot(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_cbrt_pd (__m256d a)

Synopsis

__m256d _mm256_cbrt_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the cube root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := CubeRoot(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_cbrt_pd (__m512d a)

Synopsis

__m512d _mm512_cbrt_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cube root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := CubeRoot(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_cbrt_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_cbrt_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cube root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := CubeRoot(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_cbrt_ps (__m128 a)

Synopsis

__m128 _mm_cbrt_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the cube root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := CubeRoot(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_cbrt_ps (__m256 a)

Synopsis

__m256 _mm256_cbrt_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the cube root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := CubeRoot(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_cbrt_ps (__m512 a)

Synopsis

__m512 _mm512_cbrt_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cube root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := CubeRoot(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_cbrt_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_cbrt_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cube root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := CubeRoot(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_cdfnorm_pd (__m128d a)

Synopsis

__m128d _mm_cdfnorm_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := CDFNormal(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_cdfnorm_pd (__m256d a)

Synopsis

__m256d _mm256_cdfnorm_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := CDFNormal(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_cdfnorm_pd (__m512d a)

Synopsis

__m512d _mm512_cdfnorm_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := CDFNormal(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_cdfnorm_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_cdfnorm_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := CDFNormal(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_cdfnorm_ps (__m128 a)

Synopsis

__m128 _mm_cdfnorm_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := CDFNormal(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_cdfnorm_ps (__m256 a)

Synopsis

__m256 _mm256_cdfnorm_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := CDFNormal(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_cdfnorm_ps (__m512 a)

Synopsis

__m512 _mm512_cdfnorm_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := CDFNormal(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_cdfnorm_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_cdfnorm_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := CDFNormal(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_cdfnorminv_pd (__m128d a)

Synopsis

__m128d _mm_cdfnorminv_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := InverseCDFNormal(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_cdfnorminv_pd (__m256d a)

Synopsis

__m256d _mm256_cdfnorminv_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := InverseCDFNormal(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_cdfnorminv_pd (__m512d a)

Synopsis

__m512d _mm512_cdfnorminv_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := InverseCDFNormal(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_cdfnorminv_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_cdfnorminv_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := InverseCDFNormal(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_cdfnorminv_ps (__m128 a)

Synopsis

__m128 _mm_cdfnorminv_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := InverseCDFNormal(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_cdfnorminv_ps (__m256 a)

Synopsis

__m256 _mm256_cdfnorminv_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := InverseCDFNormal(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_cdfnorminv_ps (__m512 a)

Synopsis

__m512 _mm512_cdfnorminv_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := InverseCDFNormal(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_cdfnorminv_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_cdfnorminv_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := InverseCDFNormal(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
roundpd
__m128d _mm_ceil_pd (__m128d a)

Synopsis

__m128d _mm_ceil_pd (__m128d a)
#include "smmintrin.h"
Instruction: roundpd xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := CEIL(a[i+63:i]) ENDFOR

Performance

Architecture | Latency | Throughput
Haswell      | 6       | 2
Ivy Bridge   | 3       | 1
Sandy Bridge | 3       | 1
Westmere     | 3       | 1
Nehalem      | 3       | 1
vroundpd
__m256d _mm256_ceil_pd (__m256d a)

Synopsis

__m256d _mm256_ceil_pd (__m256d a)
#include "immintrin.h"
Instruction: vroundpd ymm, ymm, imm
CPUID Flags: AVX

Description

Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := CEIL(a[i+63:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 6       | 1
Ivy Bridge   | 3       | 1
Sandy Bridge | 3       | 1
...
__m512d _mm512_ceil_pd (__m512d a)

Synopsis

__m512d _mm512_ceil_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := CEIL(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_ceil_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_ceil_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := CEIL(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
roundps
__m128 _mm_ceil_ps (__m128 a)

Synopsis

__m128 _mm_ceil_ps (__m128 a)
#include "smmintrin.h"
Instruction: roundps xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := CEIL(a[i+31:i]) ENDFOR

Performance

Architecture | Latency | Throughput
Haswell      | 6       | 2
Ivy Bridge   | 3       | 1
Sandy Bridge | 3       | 1
Westmere     | 3       | 1
Nehalem      | 3       | 1
vroundps
__m256 _mm256_ceil_ps (__m256 a)

Synopsis

__m256 _mm256_ceil_ps (__m256 a)
#include "immintrin.h"
Instruction: vroundps ymm, ymm, imm
CPUID Flags: AVX

Description

Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := CEIL(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell61
Ivy Bridge31
Sandy Bridge31
...
__m512 _mm512_ceil_ps (__m512 a)

Synopsis

__m512 _mm512_ceil_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := CEIL(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_ceil_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_ceil_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := CEIL(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
roundsd
__m128d _mm_ceil_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_ceil_sd (__m128d a, __m128d b)
#include "smmintrin.h"
Instruction: roundsd xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Round the lower double-precision (64-bit) floating-point element in b up to an integer value, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := CEIL(b[63:0]) dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell62
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
roundss
__m128 _mm_ceil_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_ceil_ss (__m128 a, __m128 b)
#include "smmintrin.h"
Instruction: roundss xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Round the lower single-precision (32-bit) floating-point element in b up to an integer value, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := CEIL(b[31:0]) dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell62
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
...
__m128 _mm_cexp_ps (__m128 a)

Synopsis

__m128 _mm_cexp_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the exponential value of e raised to the power of packed complex single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := e^(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_cexp_ps (__m256 a)

Synopsis

__m256 _mm256_cexp_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the exponential value of e raised to the power of packed complex single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := e^(a[i+31:i]) ENDFOR dst[MAX:256] := 0
clevict0, clevict1
void _mm_clevict (const void * ptr, int level)

Synopsis

void _mm_clevict (const void * ptr, int level)
#include "immintrin.h"
Instruction: clevict0 m
             clevict1 m
CPUID Flags: KNCNI

Description

Evicts the cache line containing the address ptr from cache level level (can be either 0 or 1).

Operation

CacheLineEvict(ptr, level)
clflush
void _mm_clflush (void const* p)

Synopsis

void _mm_clflush (void const* p)
#include "emmintrin.h"
Instruction: clflush m
CPUID Flags: SSE2

Description

Invalidate and flush the cache line that contains p from all levels of the cache hierarchy.

Performance

ArchitectureLatencyThroughput
Haswell2-
Ivy Bridge2-
Sandy Bridge2-
Westmere2-
Nehalem2-
clflushopt
void _mm_clflushopt (void const * p)

Synopsis

void _mm_clflushopt (void const * p)
#include "immintrin.h"
Instruction: clflushopt
CPUID Flags: CLFLUSHOPT

Description

Invalidate and flush the cache line that contains p from all levels of the cache hierarchy.
pclmulqdq
__m128i _mm_clmulepi64_si128 (__m128i a, __m128i b, const int imm8)

Synopsis

__m128i _mm_clmulepi64_si128 (__m128i a, __m128i b, const int imm8)
#include "wmmintrin.h"
Instruction: pclmulqdq xmm, xmm, imm
CPUID Flags: PCLMULQDQ

Description

Perform a carry-less multiplication of two 64-bit integers, selected from a and b according to imm8, and store the results in dst.

Operation

IF (imm8[0] = 0) TEMP1 := a[63:0]; ELSE TEMP1 := a[127:64]; FI IF (imm8[4] = 0) TEMP2 := b[63:0]; ELSE TEMP2 := b[127:64]; FI FOR i := 0 to 63 TEMP[i] := (TEMP1[0] AND TEMP2[i]); FOR j := 1 to i TEMP [i] := TEMP [i] XOR (TEMP1[j] AND TEMP2[i-j]) ENDFOR dst[i] := TEMP[i]; ENDFOR FOR i := 64 to 127 TEMP [i] := 0; FOR j := (i - 63) to 63 TEMP [i] := TEMP [i] XOR (TEMP1[j] AND TEMP2[i-j]) ENDFOR dst[i] := TEMP[i]; ENDFOR dst[127] := 0

Performance

ArchitectureLatencyThroughput
Haswell72
Ivy Bridge148
Sandy Bridge148
Westmere148
...
__m128 _mm_clog_ps (__m128 a)

Synopsis

__m128 _mm_clog_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the natural logarithm of packed complex single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ln(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_clog_ps (__m256 a)

Synopsis

__m256 _mm256_clog_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the natural logarithm of packed complex single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ln(a[i+31:i]) ENDFOR dst[MAX:256] := 0
vpcmpw
__mmask8 _mm_cmp_epi16_mask (__m128i a, __m128i b, const int imm8)

Synopsis

__mmask8 _mm_cmp_epi16_mask (__m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpw
__mmask8 _mm_mask_cmp_epi16_mask (__mmask8 k1, __m128i a, __m128i b, const int imm8)

Synopsis

__mmask8 _mm_mask_cmp_epi16_mask (__mmask8 k1, __m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpw
__mmask16 _mm256_cmp_epi16_mask (__m256i a, __m256i b, const int imm8)

Synopsis

__mmask16 _mm256_cmp_epi16_mask (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpw
__mmask16 _mm256_mask_cmp_epi16_mask (__mmask16 k1, __m256i a, __m256i b, const int imm8)

Synopsis

__mmask16 _mm256_mask_cmp_epi16_mask (__mmask16 k1, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpw
__mmask32 _mm512_cmp_epi16_mask (__m512i a, __m512i b, const int imm8)

Synopsis

__mmask32 _mm512_cmp_epi16_mask (__m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpw
__mmask32 _mm512_mask_cmp_epi16_mask (__mmask32 k1, __m512i a, __m512i b, const int imm8)

Synopsis

__mmask32 _mm512_mask_cmp_epi16_mask (__mmask32 k1, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpd
__mmask8 _mm_cmp_epi32_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm_cmp_epi32_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 3 i := j*32 k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpd
__mmask8 _mm_mask_cmp_epi32_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm_mask_cmp_epi32_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpd
__mmask8 _mm256_cmp_epi32_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm256_cmp_epi32_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*32 k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpd
__mmask8 _mm256_mask_cmp_epi32_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm256_mask_cmp_epi32_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpd
__mmask16 _mm512_cmp_epi32_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask16 _mm512_cmp_epi32_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpd
__mmask16 _mm512_mask_cmp_epi32_mask (__mmask16 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask16 _mm512_mask_cmp_epi32_mask (__mmask16 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpq
__mmask8 _mm_cmp_epi64_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm_cmp_epi64_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 1 i := j*64 k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:2] := 0
vpcmpq
__mmask8 _mm_mask_cmp_epi64_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm_mask_cmp_epi64_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpcmpq
__mmask8 _mm256_cmp_epi64_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm256_cmp_epi64_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 3 i := j*64 k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpq
__mmask8 _mm256_mask_cmp_epi64_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm256_mask_cmp_epi64_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpq
__mmask8 _mm512_cmp_epi64_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm512_cmp_epi64_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpq
__mmask8 _mm512_mask_cmp_epi64_mask (__mmask8 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm512_mask_cmp_epi64_mask (__mmask8 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpb
__mmask16 _mm_cmp_epi8_mask (__m128i a, __m128i b, const int imm8)

Synopsis

__mmask16 _mm_cmp_epi8_mask (__m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpb
__mmask16 _mm_mask_cmp_epi8_mask (__mmask16 k1, __m128i a, __m128i b, const int imm8)

Synopsis

__mmask16 _mm_mask_cmp_epi8_mask (__mmask16 k1, __m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpb
__mmask32 _mm256_cmp_epi8_mask (__m256i a, __m256i b, const int imm8)

Synopsis

__mmask32 _mm256_cmp_epi8_mask (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpb
__mmask32 _mm256_mask_cmp_epi8_mask (__mmask32 k1, __m256i a, __m256i b, const int imm8)

Synopsis

__mmask32 _mm256_mask_cmp_epi8_mask (__mmask32 k1, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpb
__mmask64 _mm512_cmp_epi8_mask (__m512i a, __m512i b, const int imm8)

Synopsis

__mmask64 _mm512_cmp_epi8_mask (__m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
vpcmpb
__mmask64 _mm512_mask_cmp_epi8_mask (__mmask64 k1, __m512i a, __m512i b, const int imm8)

Synopsis

__mmask64 _mm512_mask_cmp_epi8_mask (__mmask64 k1, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
vpcmpuw
__mmask8 _mm_cmp_epu16_mask (__m128i a, __m128i b, const int imm8)

Synopsis

__mmask8 _mm_cmp_epu16_mask (__m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpuw
__mmask8 _mm_mask_cmp_epu16_mask (__mmask8 k1, __m128i a, __m128i b, const int imm8)

Synopsis

__mmask8 _mm_mask_cmp_epu16_mask (__mmask8 k1, __m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpuw
__mmask16 _mm256_cmp_epu16_mask (__m256i a, __m256i b, const int imm8)

Synopsis

__mmask16 _mm256_cmp_epu16_mask (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpuw
__mmask16 _mm256_mask_cmp_epu16_mask (__mmask16 k1, __m256i a, __m256i b, const int imm8)

Synopsis

__mmask16 _mm256_mask_cmp_epu16_mask (__mmask16 k1, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpuw
__mmask32 _mm512_cmp_epu16_mask (__m512i a, __m512i b, const int imm8)

Synopsis

__mmask32 _mm512_cmp_epu16_mask (__m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpuw
__mmask32 _mm512_mask_cmp_epu16_mask (__mmask32 k1, __m512i a, __m512i b, const int imm8)

Synopsis

__mmask32 _mm512_mask_cmp_epu16_mask (__mmask32 k1, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpud
__mmask8 _mm_cmp_epu32_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm_cmp_epu32_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 3 i := j*32 k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpud
__mmask8 _mm_mask_cmp_epu32_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm_mask_cmp_epu32_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpud
__mmask8 _mm256_cmp_epu32_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm256_cmp_epu32_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*32 k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpud
__mmask8 _mm256_mask_cmp_epu32_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm256_mask_cmp_epu32_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpud
__mmask16 _mm512_cmp_epu32_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask16 _mm512_cmp_epu32_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpud
__mmask16 _mm512_mask_cmp_epu32_mask (__mmask16 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask16 _mm512_mask_cmp_epu32_mask (__mmask16 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpuq
__mmask8 _mm_cmp_epu64_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm_cmp_epu64_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 1 i := j*64 k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:2] := 0
vpcmpuq
__mmask8 _mm_mask_cmp_epu64_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm_mask_cmp_epu64_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpcmpuq
__mmask8 _mm256_cmp_epu64_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm256_cmp_epu64_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 3 i := j*64 k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpuq
__mmask8 _mm256_mask_cmp_epu64_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm256_mask_cmp_epu64_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpuq
__mmask8 _mm512_cmp_epu64_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm512_cmp_epu64_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpuq
__mmask8 _mm512_mask_cmp_epu64_mask (__mmask8 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)

Synopsis

__mmask8 _mm512_mask_cmp_epu64_mask (__mmask8 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpub
__mmask16 _mm_cmp_epu8_mask (__m128i a, __m128i b, const int imm8)

Synopsis

__mmask16 _mm_cmp_epu8_mask (__m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpub
__mmask16 _mm_mask_cmp_epu8_mask (__mmask16 k1, __m128i a, __m128i b, const int imm8)

Synopsis

__mmask16 _mm_mask_cmp_epu8_mask (__mmask16 k1, __m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpub
__mmask32 _mm256_cmp_epu8_mask (__m256i a, __m256i b, const int imm8)

Synopsis

__mmask32 _mm256_cmp_epu8_mask (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpub
__mmask32 _mm256_mask_cmp_epu8_mask (__mmask32 k1, __m256i a, __m256i b, const int imm8)

Synopsis

__mmask32 _mm256_mask_cmp_epu8_mask (__mmask32 k1, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpub
__mmask64 _mm512_cmp_epu8_mask (__m512i a, __m512i b, const int imm8)

Synopsis

__mmask64 _mm512_cmp_epu8_mask (__m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
vpcmpub
__mmask64 _mm512_mask_cmp_epu8_mask (__mmask64 k1, __m512i a, __m512i b, const int imm8)

Synopsis

__mmask64 _mm512_mask_cmp_epu8_mask (__mmask64 k1, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
vcmppd
__m128d _mm_cmp_pd (__m128d a, __m128d b, const int imm8)

Synopsis

__m128d _mm_cmp_pd (__m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd xmm, xmm, xmm, imm
CPUID Flags: AVX

Description

Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 1 i := j*64 dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
vcmppd
__m256d _mm256_cmp_pd (__m256d a, __m256d b, const int imm8)

Synopsis

__m256d _mm256_cmp_pd (__m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd ymm, ymm, ymm, imm
CPUID Flags: AVX

Description

Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 3 i := j*64 dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
vcmppd
__mmask8 _mm_cmp_pd_mask (__m128d a, __m128d b, const int imm8)

Synopsis

__mmask8 _mm_cmp_pd_mask (__m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 1 i := j*64 k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:2] := 0
vcmppd
__mmask8 _mm_mask_cmp_pd_mask (__mmask8 k1, __m128d a, __m128d b, const int imm8)

Synopsis

__mmask8 _mm_mask_cmp_pd_mask (__mmask8 k1, __m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vcmppd
__mmask8 _mm256_cmp_pd_mask (__m256d a, __m256d b, const int imm8)

Synopsis

__mmask8 _mm256_cmp_pd_mask (__m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 3 i := j*64 k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 ENDFOR k[MAX:4] := 0
vcmppd
__mmask8 _mm256_mask_cmp_pd_mask (__mmask8 k1, __m256d a, __m256d b, const int imm8)

Synopsis

__mmask8 _mm256_mask_cmp_pd_mask (__mmask8 k1, __m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vcmppd
__mmask8 _mm512_cmp_pd_mask (__m512d a, __m512d b, const int imm8)

Synopsis

__mmask8 _mm512_cmp_pd_mask (__m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 7 i := j*64 k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 ENDFOR k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmp_pd_mask (__mmask8 k1, __m512d a, __m512d b, const int imm8)

Synopsis

__mmask8 _mm512_mask_cmp_pd_mask (__mmask8 k1, __m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vcmpps
__m128 _mm_cmp_ps (__m128 a, __m128 b, const int imm8)

Synopsis

__m128 _mm_cmp_ps (__m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps xmm, xmm, xmm, imm
CPUID Flags: AVX

Description

Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 3 i := j*32 dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
vcmpps
__m256 _mm256_cmp_ps (__m256 a, __m256 b, const int imm8)

Synopsis

__m256 _mm256_cmp_ps (__m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps ymm, ymm, ymm, imm
CPUID Flags: AVX

Description

Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 7 i := j*32 dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
vcmpps
__mmask8 _mm_cmp_ps_mask (__m128 a, __m128 b, const int imm8)

Synopsis

__mmask8 _mm_cmp_ps_mask (__m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 3 i := j*32 k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vcmpps
__mmask8 _mm_mask_cmp_ps_mask (__mmask8 k1, __m128 a, __m128 b, const int imm8)

Synopsis

__mmask8 _mm_mask_cmp_ps_mask (__mmask8 k1, __m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vcmpps
__mmask8 _mm256_cmp_ps_mask (__m256 a, __m256 b, const int imm8)

Synopsis

__mmask8 _mm256_cmp_ps_mask (__m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 7 i := j*32 k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 ENDFOR k[MAX:8] := 0
vcmpps
__mmask8 _mm256_mask_cmp_ps_mask (__mmask8 k1, __m256 a, __m256 b, const int imm8)

Synopsis

__mmask8 _mm256_mask_cmp_ps_mask (__mmask8 k1, __m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vcmpps
__mmask16 _mm512_cmp_ps_mask (__m512 a, __m512 b, const int imm8)

Synopsis

__mmask16 _mm512_cmp_ps_mask (__m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 15 i := j*32 k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 ENDFOR k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmp_ps_mask (__mmask16 k1, __m512 a, __m512 b, const int imm8)

Synopsis

__mmask16 _mm512_mask_cmp_ps_mask (__mmask16 k1, __m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vcmppd
__mmask8 _mm512_cmp_round_pd_mask (__m512d a, __m512d b, const int imm8, const int sae)

Synopsis

__mmask8 _mm512_cmp_round_pd_mask (__m512d a, __m512d b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm {sae}, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 7 i := j*64 k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 ENDFOR k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmp_round_pd_mask (__mmask8 k1, __m512d a, __m512d b, const int imm8, const int sae)

Synopsis

__mmask8 _mm512_mask_cmp_round_pd_mask (__mmask8 k1, __m512d a, __m512d b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm {sae}, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vcmpps
__mmask16 _mm512_cmp_round_ps_mask (__m512 a, __m512 b, const int imm8, const int sae)

Synopsis

__mmask16 _mm512_cmp_round_ps_mask (__m512 a, __m512 b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm {sae}, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 15 i := j*32 k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 ENDFOR k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmp_round_ps_mask (__mmask16 k1, __m512 a, __m512 b, const int imm8, const int sae)

Synopsis

__mmask16 _mm512_mask_cmp_round_ps_mask (__mmask16 k1, __m512 a, __m512 b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm {sae}, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vcmpsd
__mmask8 _mm_cmp_round_sd_mask (__m128d a, __m128d b, const int imm8, const int sae)

Synopsis

__mmask8 _mm_cmp_round_sd_mask (__m128d a, __m128d b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmpsd k {k}, xmm, xmm {sae}, imm
CPUID Flags: AVX512F

Description

Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 k[MAX:1] := 0
vcmpsd
__mmask8 _mm_mask_cmp_round_sd_mask (__mmask8 k1, __m128d a, __m128d b, const int imm8, const int sae)

Synopsis

__mmask8 _mm_mask_cmp_round_sd_mask (__mmask8 k1, __m128d a, __m128d b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmpsd k {k}, xmm, xmm {sae}, imm
CPUID Flags: AVX512F

Description

Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC IF k1[0] k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 ELSE k[0] := 0 FI k[MAX:1] := 0
vcmpss
__mmask8 _mm_cmp_round_ss_mask (__m128 a, __m128 b, const int imm8, const int sae)

Synopsis

__mmask8 _mm_cmp_round_ss_mask (__m128 a, __m128 b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmpss k {k}, xmm, xmm {sae}, imm
CPUID Flags: AVX512F

Description

Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 k[MAX:1] := 0
vcmpss
__mmask8 _mm_mask_cmp_round_ss_mask (__mmask8 k1, __m128 a, __m128 b, const int imm8, const int sae)

Synopsis

__mmask8 _mm_mask_cmp_round_ss_mask (__mmask8 k1, __m128 a, __m128 b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmpss k {k}, xmm, xmm {sae}, imm
CPUID Flags: AVX512F

Description

Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC IF k1[0] k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 ELSE k[0] := 0 FI k[MAX:1] := 0
vcmpsd
__m128d _mm_cmp_sd (__m128d a, __m128d b, const int imm8)

Synopsis

__m128d _mm_cmp_sd (__m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vcmpsd xmm, xmm, xmm, imm
CPUID Flags: AVX

Description

Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC dst[63:0] := ( a[63:0] OP b[63:0] ) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64] dst[MAX:128] := 0

Performance

Architecture | Latency | Throughput
Haswell | 3 | -
Ivy Bridge | 3 | -
Sandy Bridge | 3 | -
vcmpsd
__mmask8 _mm_cmp_sd_mask (__m128d a, __m128d b, const int imm8)

Synopsis

__mmask8 _mm_cmp_sd_mask (__m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vcmpsd k {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 k[MAX:1] := 0
vcmpsd
__mmask8 _mm_mask_cmp_sd_mask (__mmask8 k1, __m128d a, __m128d b, const int imm8)

Synopsis

__mmask8 _mm_mask_cmp_sd_mask (__mmask8 k1, __m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vcmpsd k {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC IF k1[0] k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 ELSE k[0] := 0 FI k[MAX:1] := 0
vcmpss
__m128 _mm_cmp_ss (__m128 a, __m128 b, const int imm8)

Synopsis

__m128 _mm_cmp_ss (__m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpss xmm, xmm, xmm, imm
CPUID Flags: AVX

Description

Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC dst[31:0] := ( a[31:0] OP b[31:0] ) ? 0xFFFFFFFF : 0 dst[127:32] := a[127:32] dst[MAX:128] := 0

Performance

Architecture | Latency | Throughput
Haswell | 3 | -
Ivy Bridge | 3 | -
Sandy Bridge | 3 | -
vcmpss
__mmask8 _mm_cmp_ss_mask (__m128 a, __m128 b, const int imm8)

Synopsis

__mmask8 _mm_cmp_ss_mask (__m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpss k {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 k[MAX:1] := 0
vcmpss
__mmask8 _mm_mask_cmp_ss_mask (__mmask8 k1, __m128 a, __m128 b, const int imm8)

Synopsis

__mmask8 _mm_mask_cmp_ss_mask (__mmask8 k1, __m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpss k {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC IF k1[0] k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 ELSE k[0] := 0 FI k[MAX:1] := 0
pcmpeqw
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpeqw xmm, xmm
CPUID Flags: SSE2

Description

Compare packed 16-bit integers in a and b for equality, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
Ivy Bridge | 1 | 0.5
Sandy Bridge | 1 | 0.5
Westmere | 1 | 0.5
Nehalem | 1 | 0.5
vpcmpeqw
__m256i _mm256_cmpeq_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpeq_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpeqw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 16-bit integers in a and b for equality, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
vpcmpw
__mmask8 _mm_cmpeq_epi16_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpeq_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpw
__mmask8 _mm_mask_cmpeq_epi16_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpeq_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpw
__mmask16 _mm256_cmpeq_epi16_mask (__m256i a, __m256i b)

Synopsis

__mmask16 _mm256_cmpeq_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpw
__mmask16 _mm256_mask_cmpeq_epi16_mask (__mmask16 k1, __m256i a, __m256i b)

Synopsis

__mmask16 _mm256_mask_cmpeq_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpw
__mmask32 _mm512_cmpeq_epi16_mask (__m512i a, __m512i b)

Synopsis

__mmask32 _mm512_cmpeq_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpw
__mmask32 _mm512_mask_cmpeq_epi16_mask (__mmask32 k1, __m512i a, __m512i b)

Synopsis

__mmask32 _mm512_mask_cmpeq_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
pcmpeqd
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpeqd xmm, xmm
CPUID Flags: SSE2

Description

Compare packed 32-bit integers in a and b for equality, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
Ivy Bridge | 1 | 0.5
Sandy Bridge | 1 | 0.5
Westmere | 1 | 0.5
Nehalem | 1 | 0.5
vpcmpeqd
__m256i _mm256_cmpeq_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpeq_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpeqd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 32-bit integers in a and b for equality, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
vpcmpd
__mmask8 _mm_cmpeq_epi32_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpeq_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*32 k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpd
__mmask8 _mm_mask_cmpeq_epi32_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpeq_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpd
__mmask8 _mm256_cmpeq_epi32_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpeq_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*32 k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpd
__mmask8 _mm256_mask_cmpeq_epi32_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpeq_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpeqd
__mmask16 _mm512_cmpeq_epi32_mask (__m512i a, __m512i b)

Synopsis

__mmask16 _mm512_cmpeq_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpeqd k {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpeqd
__mmask16 _mm512_mask_cmpeq_epi32_mask (__mmask16 k1, __m512i a, __m512i b)

Synopsis

__mmask16 _mm512_mask_cmpeq_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpeqd k {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
pcmpeqq
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pcmpeqq xmm, xmm
CPUID Flags: SSE4.1

Description

Compare packed 64-bit integers in a and b for equality, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
Ivy Bridge | 1 | 0.5
Sandy Bridge | 1 | 0.5
Westmere | 1 | 0.5
Nehalem | 1 | 0.5
vpcmpeqq
__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpeqq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 64-bit integers in a and b for equality, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
vpcmpq
__mmask8 _mm_cmpeq_epi64_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpeq_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 1 i := j*64 k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:2] := 0
vpcmpq
__mmask8 _mm_mask_cmpeq_epi64_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpeq_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpcmpq
__mmask8 _mm256_cmpeq_epi64_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpeq_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*64 k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpq
__mmask8 _mm256_mask_cmpeq_epi64_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpeq_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpeqq
__mmask8 _mm512_cmpeq_epi64_mask (__m512i a, __m512i b)

Synopsis

__mmask8 _mm512_cmpeq_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpeqq k {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpeqq
__mmask8 _mm512_mask_cmpeq_epi64_mask (__mmask8 k1, __m512i a, __m512i b)

Synopsis

__mmask8 _mm512_mask_cmpeq_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpeqq k {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
pcmpeqb
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpeqb xmm, xmm
CPUID Flags: SSE2

Description

Compare packed 8-bit integers in a and b for equality, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
Ivy Bridge | 1 | 0.5
Sandy Bridge | 1 | 0.5
Westmere | 1 | 0.5
Nehalem | 1 | 0.5
vpcmpeqb
__m256i _mm256_cmpeq_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpeq_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpeqb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 8-bit integers in a and b for equality, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
vpcmpb
__mmask16 _mm_cmpeq_epi8_mask (__m128i a, __m128i b)

Synopsis

__mmask16 _mm_cmpeq_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpb
__mmask16 _mm_mask_cmpeq_epi8_mask (__mmask16 k1, __m128i a, __m128i b)

Synopsis

__mmask16 _mm_mask_cmpeq_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpb
__mmask32 _mm256_cmpeq_epi8_mask (__m256i a, __m256i b)

Synopsis

__mmask32 _mm256_cmpeq_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpb
__mmask32 _mm256_mask_cmpeq_epi8_mask (__mmask32 k1, __m256i a, __m256i b)

Synopsis

__mmask32 _mm256_mask_cmpeq_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpb
__mmask64 _mm512_cmpeq_epi8_mask (__m512i a, __m512i b)

Synopsis

__mmask64 _mm512_cmpeq_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
vpcmpb
__mmask64 _mm512_mask_cmpeq_epi8_mask (__mmask64 k1, __m512i a, __m512i b)

Synopsis

__mmask64 _mm512_mask_cmpeq_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
vpcmpuw
__mmask8 _mm_cmpeq_epu16_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpeq_epu16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpuw
__mmask8 _mm_mask_cmpeq_epu16_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpeq_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpuw
__mmask16 _mm256_cmpeq_epu16_mask (__m256i a, __m256i b)

Synopsis

__mmask16 _mm256_cmpeq_epu16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpuw
__mmask16 _mm256_mask_cmpeq_epu16_mask (__mmask16 k1, __m256i a, __m256i b)

Synopsis

__mmask16 _mm256_mask_cmpeq_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpuw
__mmask32 _mm512_cmpeq_epu16_mask (__m512i a, __m512i b)

Synopsis

__mmask32 _mm512_cmpeq_epu16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpuw
__mmask32 _mm512_mask_cmpeq_epu16_mask (__mmask32 k1, __m512i a, __m512i b)

Synopsis

__mmask32 _mm512_mask_cmpeq_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpud
__mmask8 _mm_cmpeq_epu32_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpeq_epu32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*32 k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpud
__mmask8 _mm_mask_cmpeq_epu32_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpeq_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpud
__mmask8 _mm256_cmpeq_epu32_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpeq_epu32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*32 k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpud
__mmask8 _mm256_mask_cmpeq_epu32_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpeq_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpud
__mmask16 _mm512_cmpeq_epu32_mask (__m512i a, __m512i b)

Synopsis

__mmask16 _mm512_cmpeq_epu32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpud
__mmask16 _mm512_mask_cmpeq_epu32_mask (__mmask16 k1, __m512i a, __m512i b)

Synopsis

__mmask16 _mm512_mask_cmpeq_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpuq
__mmask8 _mm_cmpeq_epu64_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpeq_epu64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 1 i := j*64 k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:2] := 0
vpcmpuq
__mmask8 _mm_mask_cmpeq_epu64_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpeq_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpcmpuq
__mmask8 _mm256_cmpeq_epu64_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpeq_epu64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*64 k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpuq
__mmask8 _mm256_mask_cmpeq_epu64_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpeq_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpuq
__mmask8 _mm512_cmpeq_epu64_mask (__m512i a, __m512i b)

Synopsis

__mmask8 _mm512_cmpeq_epu64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpuq
__mmask8 _mm512_mask_cmpeq_epu64_mask (__mmask8 k1, __m512i a, __m512i b)

Synopsis

__mmask8 _mm512_mask_cmpeq_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpub
__mmask16 _mm_cmpeq_epu8_mask (__m128i a, __m128i b)

Synopsis

__mmask16 _mm_cmpeq_epu8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpub
__mmask16 _mm_mask_cmpeq_epu8_mask (__mmask16 k1, __m128i a, __m128i b)

Synopsis

__mmask16 _mm_mask_cmpeq_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpub
__mmask32 _mm256_cmpeq_epu8_mask (__m256i a, __m256i b)

Synopsis

__mmask32 _mm256_cmpeq_epu8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpub
__mmask32 _mm256_mask_cmpeq_epu8_mask (__mmask32 k1, __m256i a, __m256i b)

Synopsis

__mmask32 _mm256_mask_cmpeq_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpub
__mmask64 _mm512_cmpeq_epu8_mask (__m512i a, __m512i b)

Synopsis

__mmask64 _mm512_cmpeq_epu8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
vpcmpub
__mmask64 _mm512_mask_cmpeq_epu8_mask (__mmask64 k1, __m512i a, __m512i b)

Synopsis

__mmask64 _mm512_mask_cmpeq_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
cmppd
__m128d _mm_cmpeq_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpeq_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := (a[i+63:i] == b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vcmppd
__mmask8 _mm512_cmpeq_pd_mask (__m512d a, __m512d b)

Synopsis

__mmask8 _mm512_cmpeq_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0 ENDFOR k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmpeq_pd_mask (__mmask8 k1, __m512d a, __m512d b)

Synopsis

__mmask8 _mm512_mask_cmpeq_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
cmpps
__m128 _mm_cmpeq_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpeq_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xffffffff : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
vcmpps
__mmask16 _mm512_cmpeq_ps_mask (__m512 a, __m512 b)

Synopsis

__mmask16 _mm512_cmpeq_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0 ENDFOR k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmpeq_ps_mask (__mmask16 k1, __m512 a, __m512 b)

Synopsis

__mmask16 _mm512_mask_cmpeq_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
cmpsd
__m128d _mm_cmpeq_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpeq_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b for equality, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := (a[63:0] == b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
cmpss
__m128 _mm_cmpeq_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpeq_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b for equality, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := ( a[31:0] == b[31:0] ) ? 0xffffffff : 0 dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
pcmpestri
int _mm_cmpestra (__m128i a, int la, __m128i b, int lb, const int imm8)

Synopsis

int _mm_cmpestra (__m128i a, int la, __m128i b, int lb, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpestri xmm, xmm, imm
CPUID Flags: SSE4.2

Description

Compare packed strings in a and b with lengths la and lb using the control in imm8, and returns 1 if b did not contain a null character and the resulting mask was zero, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters _SIDD_UWORD_OPS // unsigned 16-bit characters _SIDD_SBYTE_OPS // signed 8-bit characters _SIDD_SWORD_OPS // signed 16-bit characters _SIDD_CMP_EQUAL_ANY // compare equal any _SIDD_CMP_RANGES // compare ranges _SIDD_CMP_EQUAL_EACH // compare equal each _SIDD_CMP_EQUAL_ORDERED // compare equal ordered _SIDD_NEGATIVE_POLARITY // negate results _SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string _SIDD_LEAST_SIGNIFICANT // index only: return least significant bit _SIDD_MOST_SIGNIFICANT // index only: return most significant bit _SIDD_BIT_MASK // mask only: return bit mask _SIDD_UNIT_MASK // mask only: return byte/word mask

Operation

size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 // compare all characters aInvalid := 0 bInvalid := 0 FOR i := 0 to UpperBound m := i*size FOR j := 0 to UpperBound n := j*size BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n]) // invalidate characters after EOS IF i == la aInvalid := 1 FI IF j == lb bInvalid := 1 FI // override comparisons for invalid characters CASE (imm8[3:2]) OF 0: // equal any IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 1: // ranges IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 2: // equal each IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI 3: // equal ordered IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 1 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI ESAC ENDFOR ENDFOR // aggregate results CASE (imm8[3:2]) OF 0: // equal any IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound IntRes1[i] := IntRes1[i] OR BoolRes[i][j] ENDFOR ENDFOR 1: // ranges IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound, j += 2 IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1]) ENDFOR ENDFOR 2: // equal each IntRes1 := 0 FOR i := 0 to UpperBound IntRes1[i] := BoolRes[i][i] ENDFOR 3: // equal ordered IntRes1 := (imm8[0] ? 
0xFF : 0xFFFF) FOR i := 0 to UpperBound k := i FOR j := 0 to UpperBound-i IntRes1[i] := IntRes1[i] AND BoolRes[k][j] k++ ENDFOR ENDFOR ESAC // optionally negate results FOR i := 0 to UpperBound IF imm8[4] IF imm8[5] // only negate valid IF i >= lb // invalid, don't negate IntRes2[i] := IntRes1[i] ELSE // valid, negate IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // negate all IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // don't negate IntRes2[i] := IntRes1[i] FI ENDFOR // output dst := (IntRes2 == 0) AND (lb > UpperBound)

Performance

ArchitectureLatencyThroughput
Haswell113
Ivy Bridge113
Sandy Bridge113
Westmere72
Nehalem72
pcmpestri
int _mm_cmpestrc (__m128i a, int la, __m128i b, int lb, const int imm8)

Synopsis

int _mm_cmpestrc (__m128i a, int la, __m128i b, int lb, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpestri xmm, xmm, imm
CPUID Flags: SSE4.2

Description

Compare packed strings in a and b with lengths la and lb using the control in imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters _SIDD_UWORD_OPS // unsigned 16-bit characters _SIDD_SBYTE_OPS // signed 8-bit characters _SIDD_SWORD_OPS // signed 16-bit characters _SIDD_CMP_EQUAL_ANY // compare equal any _SIDD_CMP_RANGES // compare ranges _SIDD_CMP_EQUAL_EACH // compare equal each _SIDD_CMP_EQUAL_ORDERED // compare equal ordered _SIDD_NEGATIVE_POLARITY // negate results _SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string _SIDD_LEAST_SIGNIFICANT // index only: return least significant bit _SIDD_MOST_SIGNIFICANT // index only: return most significant bit _SIDD_BIT_MASK // mask only: return bit mask _SIDD_UNIT_MASK // mask only: return byte/word mask

Operation

size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 // compare all characters aInvalid := 0 bInvalid := 0 FOR i := 0 to UpperBound m := i*size FOR j := 0 to UpperBound n := j*size BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n]) // invalidate characters after EOS IF i == la aInvalid := 1 FI IF j == lb bInvalid := 1 FI // override comparisons for invalid characters CASE (imm8[3:2]) OF 0: // equal any IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 1: // ranges IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 2: // equal each IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI 3: // equal ordered IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 1 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI ESAC ENDFOR ENDFOR // aggregate results CASE (imm8[3:2]) OF 0: // equal any IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound IntRes1[i] := IntRes1[i] OR BoolRes[i][j] ENDFOR ENDFOR 1: // ranges IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound, j += 2 IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1]) ENDFOR ENDFOR 2: // equal each IntRes1 := 0 FOR i := 0 to UpperBound IntRes1[i] := BoolRes[i][i] ENDFOR 3: // equal ordered IntRes1 := (imm8[0] ? 
0xFF : 0xFFFF) FOR i := 0 to UpperBound k := i FOR j := 0 to UpperBound-i IntRes1[i] := IntRes1[i] AND BoolRes[k][j] k++ ENDFOR ENDFOR ESAC // optionally negate results FOR i := 0 to UpperBound IF imm8[4] IF imm8[5] // only negate valid IF i >= lb // invalid, don't negate IntRes2[i] := IntRes1[i] ELSE // valid, negate IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // negate all IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // don't negate IntRes2[i] := IntRes1[i] FI ENDFOR // output dst := (IntRes2 != 0)

Performance

ArchitectureLatencyThroughput
Haswell113
Ivy Bridge113
Sandy Bridge113
Westmere72
Nehalem72
pcmpestri
int _mm_cmpestri (__m128i a, int la, __m128i b, int lb, const int imm8)

Synopsis

int _mm_cmpestri (__m128i a, int la, __m128i b, int lb, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpestri xmm, xmm, imm
CPUID Flags: SSE4.2

Description

Compare packed strings in a and b with lengths la and lb using the control in imm8, and store the generated index in dst.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters _SIDD_UWORD_OPS // unsigned 16-bit characters _SIDD_SBYTE_OPS // signed 8-bit characters _SIDD_SWORD_OPS // signed 16-bit characters _SIDD_CMP_EQUAL_ANY // compare equal any _SIDD_CMP_RANGES // compare ranges _SIDD_CMP_EQUAL_EACH // compare equal each _SIDD_CMP_EQUAL_ORDERED // compare equal ordered _SIDD_NEGATIVE_POLARITY // negate results _SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string _SIDD_LEAST_SIGNIFICANT // index only: return least significant bit _SIDD_MOST_SIGNIFICANT // index only: return most significant bit _SIDD_BIT_MASK // mask only: return bit mask _SIDD_UNIT_MASK // mask only: return byte/word mask

Operation

size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 // compare all characters aInvalid := 0 bInvalid := 0 FOR i := 0 to UpperBound m := i*size FOR j := 0 to UpperBound n := j*size BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n]) // invalidate characters after EOS IF i == la aInvalid := 1 FI IF j == lb bInvalid := 1 FI // override comparisons for invalid characters CASE (imm8[3:2]) OF 0: // equal any IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 1: // ranges IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 2: // equal each IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI 3: // equal ordered IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 1 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI ESAC ENDFOR ENDFOR // aggregate results CASE (imm8[3:2]) OF 0: // equal any IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound IntRes1[i] := IntRes1[i] OR BoolRes[i][j] ENDFOR ENDFOR 1: // ranges IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound, j += 2 IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1]) ENDFOR ENDFOR 2: // equal each IntRes1 := 0 FOR i := 0 to UpperBound IntRes1[i] := BoolRes[i][i] ENDFOR 3: // equal ordered IntRes1 := (imm8[0] ? 
0xFF : 0xFFFF) FOR i := 0 to UpperBound k := i FOR j := 0 to UpperBound-i IntRes1[i] := IntRes1[i] AND BoolRes[k][j] k++ ENDFOR ENDFOR ESAC // optionally negate results FOR i := 0 to UpperBound IF imm8[4] IF imm8[5] // only negate valid IF i >= lb // invalid, don't negate IntRes2[i] := IntRes1[i] ELSE // valid, negate IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // negate all IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // don't negate IntRes2[i] := IntRes1[i] FI ENDFOR // output IF imm8[6] // most significant bit tmp := UpperBound dst := tmp DO WHILE ((tmp >= 0) AND IntRes2[tmp] == 0) tmp := tmp - 1 dst := tmp OD ELSE // least significant bit tmp := 0 dst := tmp DO WHILE ((tmp <= UpperBound) AND IntRes2[tmp] == 0) tmp := tmp + 1 dst := tmp OD FI

Performance

ArchitectureLatencyThroughput
Haswell113
Ivy Bridge113
Sandy Bridge113
Westmere72
Nehalem72
pcmpestrm
__m128i _mm_cmpestrm (__m128i a, int la, __m128i b, int lb, const int imm8)

Synopsis

__m128i _mm_cmpestrm (__m128i a, int la, __m128i b, int lb, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpestrm xmm, xmm, imm
CPUID Flags: SSE4.2

Description

Compare packed strings in a and b with lengths la and lb using the control in imm8, and store the generated mask in dst.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters _SIDD_UWORD_OPS // unsigned 16-bit characters _SIDD_SBYTE_OPS // signed 8-bit characters _SIDD_SWORD_OPS // signed 16-bit characters _SIDD_CMP_EQUAL_ANY // compare equal any _SIDD_CMP_RANGES // compare ranges _SIDD_CMP_EQUAL_EACH // compare equal each _SIDD_CMP_EQUAL_ORDERED // compare equal ordered _SIDD_NEGATIVE_POLARITY // negate results _SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string _SIDD_LEAST_SIGNIFICANT // index only: return least significant bit _SIDD_MOST_SIGNIFICANT // index only: return most significant bit _SIDD_BIT_MASK // mask only: return bit mask _SIDD_UNIT_MASK // mask only: return byte/word mask

Operation

size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 // compare all characters aInvalid := 0 bInvalid := 0 FOR i := 0 to UpperBound m := i*size FOR j := 0 to UpperBound n := j*size BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n]) // invalidate characters after EOS IF i == la aInvalid := 1 FI IF j == lb bInvalid := 1 FI // override comparisons for invalid characters CASE (imm8[3:2]) OF 0: // equal any IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 1: // ranges IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 2: // equal each IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI 3: // equal ordered IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 1 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI ESAC ENDFOR ENDFOR // aggregate results CASE (imm8[3:2]) OF 0: // equal any IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound IntRes1[i] := IntRes1[i] OR BoolRes[i][j] ENDFOR ENDFOR 1: // ranges IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound, j += 2 IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1]) ENDFOR ENDFOR 2: // equal each IntRes1 := 0 FOR i := 0 to UpperBound IntRes1[i] := BoolRes[i][i] ENDFOR 3: // equal ordered IntRes1 := (imm8[0] ? 
0xFF : 0xFFFF) FOR i := 0 to UpperBound k := i FOR j := 0 to UpperBound-i IntRes1[i] := IntRes1[i] AND BoolRes[k][j] k++ ENDFOR ENDFOR ESAC // optionally negate results FOR i := 0 to UpperBound IF imm8[4] IF imm8[5] // only negate valid IF i >= lb // invalid, don't negate IntRes2[i] := IntRes1[i] ELSE // valid, negate IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // negate all IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // don't negate IntRes2[i] := IntRes1[i] FI ENDFOR // output IF imm8[6] // byte / word mask FOR i := 0 to UpperBound j := i*size IF IntRes2[i] dst[j+size-1:j] := (imm8[0] ? 0xFF : 0xFFFF) ELSE dst[j+size-1:j] := 0 FI ENDFOR ELSE // bit mask dst[UpperBound:0] := IntRes2[UpperBound:0] dst[127:UpperBound+1] := 0 FI

Performance

ArchitectureLatencyThroughput
Haswell113
Ivy Bridge113
Sandy Bridge113
Westmere82
Nehalem82
pcmpestri
int _mm_cmpestro (__m128i a, int la, __m128i b, int lb, const int imm8)

Synopsis

int _mm_cmpestro (__m128i a, int la, __m128i b, int lb, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpestri xmm, xmm, imm
CPUID Flags: SSE4.2

Description

Compare packed strings in a and b with lengths la and lb using the control in imm8, and returns bit 0 of the resulting bit mask.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters _SIDD_UWORD_OPS // unsigned 16-bit characters _SIDD_SBYTE_OPS // signed 8-bit characters _SIDD_SWORD_OPS // signed 16-bit characters _SIDD_CMP_EQUAL_ANY // compare equal any _SIDD_CMP_RANGES // compare ranges _SIDD_CMP_EQUAL_EACH // compare equal each _SIDD_CMP_EQUAL_ORDERED // compare equal ordered _SIDD_NEGATIVE_POLARITY // negate results _SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string _SIDD_LEAST_SIGNIFICANT // index only: return least significant bit _SIDD_MOST_SIGNIFICANT // index only: return most significant bit _SIDD_BIT_MASK // mask only: return bit mask _SIDD_UNIT_MASK // mask only: return byte/word mask

Operation

size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 // compare all characters aInvalid := 0 bInvalid := 0 FOR i := 0 to UpperBound m := i*size FOR j := 0 to UpperBound n := j*size BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n]) // invalidate characters after EOS IF i == la aInvalid := 1 FI IF j == lb bInvalid := 1 FI // override comparisons for invalid characters CASE (imm8[3:2]) OF 0: // equal any IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 1: // ranges IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 2: // equal each IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && bInvalid) BoolRes[i][j] := 1 FI 3: // equal ordered IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 1 ELSE IF (aInvalid && bInvalid) BoolRes[i][j] := 1 FI ESAC ENDFOR ENDFOR // aggregate results CASE (imm8[3:2]) OF 0: // equal any IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound IntRes1[i] := IntRes1[i] OR BoolRes[i][j] ENDFOR ENDFOR 1: // ranges IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound, j += 2 IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1]) ENDFOR ENDFOR 2: // equal each IntRes1 := 0 FOR i := 0 to UpperBound IntRes1[i] := BoolRes[i][i] ENDFOR 3: // equal ordered IntRes1 := (imm8[0] ? 
0xFF : 0xFFFF) FOR i := 0 to UpperBound k := i FOR j := 0 to UpperBound-i IntRes1[i] := IntRes1[i] AND BoolRes[k][j] k++ ENDFOR ENDFOR ESAC // optionally negate results FOR i := 0 to UpperBound IF imm8[4] IF imm8[5] // only negate valid IF i >= lb // invalid, don't negate IntRes2[i] := IntRes1[i] ELSE // valid, negate IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // negate all IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // don't negate IntRes2[i] := IntRes1[i] FI ENDFOR // output dst := IntRes2[0]

Performance

ArchitectureLatencyThroughput
Haswell113
Ivy Bridge113
Sandy Bridge113
Westmere72
Nehalem72
pcmpestri
int _mm_cmpestrs (__m128i a, int la, __m128i b, int lb, const int imm8)

Synopsis

int _mm_cmpestrs (__m128i a, int la, __m128i b, int lb, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpestri xmm, xmm, imm
CPUID Flags: SSE4.2

Description

Compare packed strings in a and b with lengths la and lb using the control in imm8, and returns 1 if any character in a was null, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters _SIDD_UWORD_OPS // unsigned 16-bit characters _SIDD_SBYTE_OPS // signed 8-bit characters _SIDD_SWORD_OPS // signed 16-bit characters _SIDD_CMP_EQUAL_ANY // compare equal any _SIDD_CMP_RANGES // compare ranges _SIDD_CMP_EQUAL_EACH // compare equal each _SIDD_CMP_EQUAL_ORDERED // compare equal ordered _SIDD_NEGATIVE_POLARITY // negate results _SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string _SIDD_LEAST_SIGNIFICANT // index only: return least significant bit _SIDD_MOST_SIGNIFICANT // index only: return most significant bit _SIDD_BIT_MASK // mask only: return bit mask _SIDD_UNIT_MASK // mask only: return byte/word mask

Operation

size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 dst := (la <= UpperBound)

Performance

ArchitectureLatencyThroughput
Haswell113
Ivy Bridge113
Sandy Bridge113
Westmere72
Nehalem72
pcmpestri
int _mm_cmpestrz (__m128i a, int la, __m128i b, int lb, const int imm8)

Synopsis

int _mm_cmpestrz (__m128i a, int la, __m128i b, int lb, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpestri xmm, xmm, imm
CPUID Flags: SSE4.2

Description

Compare packed strings in a and b with lengths la and lb using the control in imm8, and returns 1 if any character in b was null, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters _SIDD_UWORD_OPS // unsigned 16-bit characters _SIDD_SBYTE_OPS // signed 8-bit characters _SIDD_SWORD_OPS // signed 16-bit characters _SIDD_CMP_EQUAL_ANY // compare equal any _SIDD_CMP_RANGES // compare ranges _SIDD_CMP_EQUAL_EACH // compare equal each _SIDD_CMP_EQUAL_ORDERED // compare equal ordered _SIDD_NEGATIVE_POLARITY // negate results _SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string _SIDD_LEAST_SIGNIFICANT // index only: return least significant bit _SIDD_MOST_SIGNIFICANT // index only: return most significant bit _SIDD_BIT_MASK // mask only: return bit mask _SIDD_UNIT_MASK // mask only: return byte/word mask

Operation

size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 dst := (lb <= UpperBound)

Performance

ArchitectureLatencyThroughput
Haswell113
Ivy Bridge113
Sandy Bridge113
Westmere72
Nehalem72
vpcmpw
__mmask8 _mm_cmpge_epi16_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpge_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpw
__mmask8 _mm_mask_cmpge_epi16_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpge_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpw
__mmask16 _mm256_cmpge_epi16_mask (__m256i a, __m256i b)

Synopsis

__mmask16 _mm256_cmpge_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpw
__mmask16 _mm256_mask_cmpge_epi16_mask (__mmask16 k1, __m256i a, __m256i b)

Synopsis

__mmask16 _mm256_mask_cmpge_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpw
__mmask32 _mm512_cmpge_epi16_mask (__m512i a, __m512i b)

Synopsis

__mmask32 _mm512_cmpge_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpw
__mmask32 _mm512_mask_cmpge_epi16_mask (__mmask32 k1, __m512i a, __m512i b)

Synopsis

__mmask32 _mm512_mask_cmpge_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpd
__mmask8 _mm_cmpge_epi32_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpge_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*32 k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpd
__mmask8 _mm_mask_cmpge_epi32_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpge_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpd
__mmask8 _mm256_cmpge_epi32_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpge_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*32 k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpd
__mmask8 _mm256_mask_cmpge_epi32_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpge_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpd
__mmask16 _mm512_cmpge_epi32_mask (__m512i a, __m512i b)

Synopsis

__mmask16 _mm512_cmpge_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpd
__mmask16 _mm512_mask_cmpge_epi32_mask (__mmask16 k1, __m512i a, __m512i b)

Synopsis

__mmask16 _mm512_mask_cmpge_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpq
__mmask8 _mm_cmpge_epi64_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpge_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 1 i := j*64 k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:2] := 0
vpcmpq
__mmask8 _mm_mask_cmpge_epi64_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpge_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpcmpq
__mmask8 _mm256_cmpge_epi64_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpge_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*64 k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpq
__mmask8 _mm256_mask_cmpge_epi64_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpge_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpq
__mmask8 _mm512_cmpge_epi64_mask (__m512i a, __m512i b)

Synopsis

__mmask8 _mm512_cmpge_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpq
__mmask8 _mm512_mask_cmpge_epi64_mask (__mmask8 k1, __m512i a, __m512i b)

Synopsis

__mmask8 _mm512_mask_cmpge_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpb
__mmask16 _mm_cmpge_epi8_mask (__m128i a, __m128i b)

Synopsis

__mmask16 _mm_cmpge_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpb
__mmask16 _mm_mask_cmpge_epi8_mask (__mmask16 k1, __m128i a, __m128i b)

Synopsis

__mmask16 _mm_mask_cmpge_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpb
__mmask32 _mm256_cmpge_epi8_mask (__m256i a, __m256i b)

Synopsis

__mmask32 _mm256_cmpge_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpb
__mmask32 _mm256_mask_cmpge_epi8_mask (__mmask32 k1, __m256i a, __m256i b)

Synopsis

__mmask32 _mm256_mask_cmpge_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpb
__mmask64 _mm512_cmpge_epi8_mask (__m512i a, __m512i b)

Synopsis

__mmask64 _mm512_cmpge_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
vpcmpb
__mmask64 _mm512_mask_cmpge_epi8_mask (__mmask64 k1, __m512i a, __m512i b)

Synopsis

__mmask64 _mm512_mask_cmpge_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
vpcmpuw
__mmask8 _mm_cmpge_epu16_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpge_epu16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpuw
__mmask8 _mm_mask_cmpge_epu16_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpge_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpuw
__mmask16 _mm256_cmpge_epu16_mask (__m256i a, __m256i b)

Synopsis

__mmask16 _mm256_cmpge_epu16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpuw
__mmask16 _mm256_mask_cmpge_epu16_mask (__mmask16 k1, __m256i a, __m256i b)

Synopsis

__mmask16 _mm256_mask_cmpge_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpuw
__mmask32 _mm512_cmpge_epu16_mask (__m512i a, __m512i b)

Synopsis

__mmask32 _mm512_cmpge_epu16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpuw
__mmask32 _mm512_mask_cmpge_epu16_mask (__mmask32 k1, __m512i a, __m512i b)

Synopsis

__mmask32 _mm512_mask_cmpge_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpud
__mmask8 _mm_cmpge_epu32_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpge_epu32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*32 k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpud
__mmask8 _mm_mask_cmpge_epu32_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpge_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpud
__mmask8 _mm256_cmpge_epu32_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpge_epu32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*32 k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpud
__mmask8 _mm256_mask_cmpge_epu32_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpge_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpud
__mmask16 _mm512_cmpge_epu32_mask (__m512i a, __m512i b)

Synopsis

__mmask16 _mm512_cmpge_epu32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpud
__mmask16 _mm512_mask_cmpge_epu32_mask (__mmask16 k1, __m512i a, __m512i b)

Synopsis

__mmask16 _mm512_mask_cmpge_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpuq
__mmask8 _mm_cmpge_epu64_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpge_epu64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 1 i := j*64 k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:2] := 0
vpcmpuq
__mmask8 _mm_mask_cmpge_epu64_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpge_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpcmpuq
__mmask8 _mm256_cmpge_epu64_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpge_epu64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*64 k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpuq
__mmask8 _mm256_mask_cmpge_epu64_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpge_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpuq
__mmask8 _mm512_cmpge_epu64_mask (__m512i a, __m512i b)

Synopsis

__mmask8 _mm512_cmpge_epu64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpuq
__mmask8 _mm512_mask_cmpge_epu64_mask (__mmask8 k1, __m512i a, __m512i b)

Synopsis

__mmask8 _mm512_mask_cmpge_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpub
__mmask16 _mm_cmpge_epu8_mask (__m128i a, __m128i b)

Synopsis

__mmask16 _mm_cmpge_epu8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpub
__mmask16 _mm_mask_cmpge_epu8_mask (__mmask16 k1, __m128i a, __m128i b)

Synopsis

__mmask16 _mm_mask_cmpge_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpub
__mmask32 _mm256_cmpge_epu8_mask (__m256i a, __m256i b)

Synopsis

__mmask32 _mm256_cmpge_epu8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpub
__mmask32 _mm256_mask_cmpge_epu8_mask (__mmask32 k1, __m256i a, __m256i b)

Synopsis

__mmask32 _mm256_mask_cmpge_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpub
__mmask64 _mm512_cmpge_epu8_mask (__m512i a, __m512i b)

Synopsis

__mmask64 _mm512_cmpge_epu8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
vpcmpub
__mmask64 _mm512_mask_cmpge_epu8_mask (__mmask64 k1, __m512i a, __m512i b)

Synopsis

__mmask64 _mm512_mask_cmpge_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
cmppd
__m128d _mm_cmpge_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpge_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for greater-than-or-equal, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := (a[i+63:i] >= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
cmpps
__m128 _mm_cmpge_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpge_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than-or-equal, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ( a[i+31:i] >= b[i+31:i] ) ? 0xffffffff : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
cmpsd
__m128d _mm_cmpge_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpge_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b for greater-than-or-equal, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := (a[63:0] >= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
cmpss
__m128 _mm_cmpge_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpge_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b for greater-than-or-equal, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := ( a[31:0] >= b[31:0] ) ? 0xffffffff : 0 dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
pcmpgtw
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpgtw xmm, xmm
CPUID Flags: SSE2

Description

Compare packed 16-bit integers in a and b for greater-than, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpcmpgtw
__m256i _mm256_cmpgt_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpgt_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpgtw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 16-bit integers in a and b for greater-than, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpcmpw
__mmask8 _mm_cmpgt_epi16_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpgt_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpw
__mmask8 _mm_mask_cmpgt_epi16_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpgt_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpw
__mmask16 _mm256_cmpgt_epi16_mask (__m256i a, __m256i b)

Synopsis

__mmask16 _mm256_cmpgt_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpw
__mmask16 _mm256_mask_cmpgt_epi16_mask (__mmask16 k1, __m256i a, __m256i b)

Synopsis

__mmask16 _mm256_mask_cmpgt_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpw
__mmask32 _mm512_cmpgt_epi16_mask (__m512i a, __m512i b)

Synopsis

__mmask32 _mm512_cmpgt_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpw
__mmask32 _mm512_mask_cmpgt_epi16_mask (__mmask32 k1, __m512i a, __m512i b)

Synopsis

__mmask32 _mm512_mask_cmpgt_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
pcmpgtd
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpgtd xmm, xmm
CPUID Flags: SSE2

Description

Compare packed 32-bit integers in a and b for greater-than, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpcmpgtd
__m256i _mm256_cmpgt_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpgt_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpgtd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 32-bit integers in a and b for greater-than, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpcmpd
__mmask8 _mm_cmpgt_epi32_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpgt_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*32 k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpd
__mmask8 _mm_mask_cmpgt_epi32_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpgt_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpd
__mmask8 _mm256_cmpgt_epi32_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpgt_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*32 k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpd
__mmask8 _mm256_mask_cmpgt_epi32_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpgt_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpgtd
__mmask16 _mm512_cmpgt_epi32_mask (__m512i a, __m512i b)

Synopsis

__mmask16 _mm512_cmpgt_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpgtd k {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpgtd
__mmask16 _mm512_mask_cmpgt_epi32_mask (__mmask16 k1, __m512i a, __m512i b)

Synopsis

__mmask16 _mm512_mask_cmpgt_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpgtd k {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
pcmpgtq
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b)
#include "nmmintrin.h"
Instruction: pcmpgtq xmm, xmm
CPUID Flags: SSE4.2

Description

Compare packed 64-bit integers in a and b for greater-than, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ( a[i+63:i] > b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge51
Sandy Bridge51
Westmere31
Nehalem31
vpcmpgtq
__m256i _mm256_cmpgt_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpgt_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpgtq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 64-bit integers in a and b for greater-than, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ( a[i+63:i] > b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell51
vpcmpq
__mmask8 _mm_cmpgt_epi64_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpgt_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 1 i := j*64 k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:2] := 0
vpcmpq
__mmask8 _mm_mask_cmpgt_epi64_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpgt_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpcmpq
__mmask8 _mm256_cmpgt_epi64_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpgt_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*64 k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpq
__mmask8 _mm256_mask_cmpgt_epi64_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpgt_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpgtq
__mmask8 _mm512_cmpgt_epi64_mask (__m512i a, __m512i b)

Synopsis

__mmask8 _mm512_cmpgt_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpgtq k {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpgtq
__mmask8 _mm512_mask_cmpgt_epi64_mask (__mmask8 k1, __m512i a, __m512i b)

Synopsis

__mmask8 _mm512_mask_cmpgt_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpgtq k {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
pcmpgtb
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpgtb xmm, xmm
CPUID Flags: SSE2

Description

Compare packed 8-bit integers in a and b for greater-than, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpcmpgtb
__m256i _mm256_cmpgt_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_cmpgt_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpgtb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 8-bit integers in a and b for greater-than, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpcmpb
__mmask16 _mm_cmpgt_epi8_mask (__m128i a, __m128i b)

Synopsis

__mmask16 _mm_cmpgt_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpb
__mmask16 _mm_mask_cmpgt_epi8_mask (__mmask16 k1, __m128i a, __m128i b)

Synopsis

__mmask16 _mm_mask_cmpgt_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpb
__mmask32 _mm256_cmpgt_epi8_mask (__m256i a, __m256i b)

Synopsis

__mmask32 _mm256_cmpgt_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpb
__mmask32 _mm256_mask_cmpgt_epi8_mask (__mmask32 k1, __m256i a, __m256i b)

Synopsis

__mmask32 _mm256_mask_cmpgt_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpb
__mmask64 _mm512_cmpgt_epi8_mask (__m512i a, __m512i b)

Synopsis

__mmask64 _mm512_cmpgt_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
vpcmpb
__mmask64 _mm512_mask_cmpgt_epi8_mask (__mmask64 k1, __m512i a, __m512i b)

Synopsis

__mmask64 _mm512_mask_cmpgt_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
vpcmpuw
__mmask8 _mm_cmpgt_epu16_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpgt_epu16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpuw
__mmask8 _mm_mask_cmpgt_epu16_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpgt_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpuw
__mmask16 _mm256_cmpgt_epu16_mask (__m256i a, __m256i b)

Synopsis

__mmask16 _mm256_cmpgt_epu16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpuw
__mmask16 _mm256_mask_cmpgt_epu16_mask (__mmask16 k1, __m256i a, __m256i b)

Synopsis

__mmask16 _mm256_mask_cmpgt_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpuw
__mmask32 _mm512_cmpgt_epu16_mask (__m512i a, __m512i b)

Synopsis

__mmask32 _mm512_cmpgt_epu16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpuw
__mmask32 _mm512_mask_cmpgt_epu16_mask (__mmask32 k1, __m512i a, __m512i b)

Synopsis

__mmask32 _mm512_mask_cmpgt_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpud
__mmask8 _mm_cmpgt_epu32_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpgt_epu32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*32 k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpud
__mmask8 _mm_mask_cmpgt_epu32_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpgt_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpud
__mmask8 _mm256_cmpgt_epu32_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpgt_epu32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*32 k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpud
__mmask8 _mm256_mask_cmpgt_epu32_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpgt_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpud
__mmask16 _mm512_cmpgt_epu32_mask (__m512i a, __m512i b)

Synopsis

__mmask16 _mm512_cmpgt_epu32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpud
__mmask16 _mm512_mask_cmpgt_epu32_mask (__mmask16 k1, __m512i a, __m512i b)

Synopsis

__mmask16 _mm512_mask_cmpgt_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpuq
__mmask8 _mm_cmpgt_epu64_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpgt_epu64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 1 i := j*64 k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:2] := 0
vpcmpuq
__mmask8 _mm_mask_cmpgt_epu64_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpgt_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpcmpuq
__mmask8 _mm256_cmpgt_epu64_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpgt_epu64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*64 k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpuq
__mmask8 _mm256_mask_cmpgt_epu64_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpgt_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpuq
__mmask8 _mm512_cmpgt_epu64_mask (__m512i a, __m512i b)

Synopsis

__mmask8 _mm512_cmpgt_epu64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpuq
__mmask8 _mm512_mask_cmpgt_epu64_mask (__mmask8 k1, __m512i a, __m512i b)

Synopsis

__mmask8 _mm512_mask_cmpgt_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpub
__mmask16 _mm_cmpgt_epu8_mask (__m128i a, __m128i b)

Synopsis

__mmask16 _mm_cmpgt_epu8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpub
__mmask16 _mm_mask_cmpgt_epu8_mask (__mmask16 k1, __m128i a, __m128i b)

Synopsis

__mmask16 _mm_mask_cmpgt_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpub
__mmask32 _mm256_cmpgt_epu8_mask (__m256i a, __m256i b)

Synopsis

__mmask32 _mm256_cmpgt_epu8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpub
__mmask32 _mm256_mask_cmpgt_epu8_mask (__mmask32 k1, __m256i a, __m256i b)

Synopsis

__mmask32 _mm256_mask_cmpgt_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpub
__mmask64 _mm512_cmpgt_epu8_mask (__m512i a, __m512i b)

Synopsis

__mmask64 _mm512_cmpgt_epu8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k.

Operation

FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
vpcmpub
__mmask64 _mm512_mask_cmpgt_epu8_mask (__mmask64 k1, __m512i a, __m512i b)

Synopsis

__mmask64 _mm512_mask_cmpgt_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
cmppd
__m128d _mm_cmpgt_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpgt_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for greater-than, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := (a[i+63:i] > b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
cmpps
__m128 _mm_cmpgt_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpgt_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xffffffff : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
cmpsd
__m128d _mm_cmpgt_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpgt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b for greater-than, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := (a[63:0] > b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
cmpss
__m128 _mm_cmpgt_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpgt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b for greater-than, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := ( a[31:0] > b[31:0] ) ? 0xffffffff : 0 dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
pcmpistri
int _mm_cmpistra (__m128i a, __m128i b, const int imm8)

Synopsis

int _mm_cmpistra (__m128i a, __m128i b, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpistri xmm, xmm, imm
CPUID Flags: SSE4.2

Description

Compare packed strings with implicit lengths in a and b using the control in imm8, and returns 1 if b did not contain a null character and the resulting mask was zero, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters _SIDD_UWORD_OPS // unsigned 16-bit characters _SIDD_SBYTE_OPS // signed 8-bit characters _SIDD_SWORD_OPS // signed 16-bit characters _SIDD_CMP_EQUAL_ANY // compare equal any _SIDD_CMP_RANGES // compare ranges _SIDD_CMP_EQUAL_EACH // compare equal each _SIDD_CMP_EQUAL_ORDERED // compare equal ordered _SIDD_NEGATIVE_POLARITY // negate results _SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string _SIDD_LEAST_SIGNIFICANT // index only: return last significant bit _SIDD_MOST_SIGNIFICANT // index only: return most significant bit _SIDD_BIT_MASK // mask only: return bit mask _SIDD_UNIT_MASK // mask only: return byte/word mask

Operation

size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 // compare all characters aInvalid := 0 bInvalid := 0 FOR i := 0 to UpperBound m := i*size FOR j := 0 to UpperBound n := j*size BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n]) // invalidate characters after EOS IF a[m+size-1:m] == 0 aInvalid := 1 FI IF b[n+size-1:n] == 0 bInvalid := 1 FI // override comparisons for invalid characters CASE (imm8[3:2]) OF 0: // equal any IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 1: // ranges IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 2: // equal each IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI 3: // equal ordered IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 1 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI ESAC ENDFOR ENDFOR // aggregate results CASE (imm8[3:2]) OF 0: // equal any IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound IntRes1[i] := IntRes1[i] OR BoolRes[i][j] ENDFOR ENDFOR 1: // ranges IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound, j += 2 IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1]) ENDFOR ENDFOR 2: // equal each IntRes1 := 0 FOR i := 0 to UpperBound IntRes1[i] := BoolRes[i][i] ENDFOR 3: // equal ordered IntRes1 := (imm8[0] ? 
0xFF : 0xFFFF) FOR i := 0 to UpperBound k := i FOR j := 0 to UpperBound-i IntRes1[i] := IntRes1[i] AND BoolRes[k][j] k++ ENDFOR ENDFOR ESAC // optionally negate results bInvalid := 0 FOR i := 0 to UpperBound IF imm8[4] IF imm8[5] // only negate valid IF b[n+size-1:n] == 0 bInvalid := 1 FI IF bInvalid // invalid, don't negate IntRes2[i] := IntRes1[i] ELSE // valid, negate IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // negate all IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // don't negate IntRes2[i] := IntRes1[i] FI ENDFOR // output dst := (IntRes2 == 0) AND bInvalid

Performance

ArchitectureLatencyThroughput
Haswell113
Ivy Bridge113
Sandy Bridge113
Westmere72
Nehalem72
pcmpistri
int _mm_cmpistrc (__m128i a, __m128i b, const int imm8)

Synopsis

int _mm_cmpistrc (__m128i a, __m128i b, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpistri xmm, xmm, imm
CPUID Flags: SSE4.2

Description

Compare packed strings with implicit lengths in a and b using the control in imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters _SIDD_UWORD_OPS // unsigned 16-bit characters _SIDD_SBYTE_OPS // signed 8-bit characters _SIDD_SWORD_OPS // signed 16-bit characters _SIDD_CMP_EQUAL_ANY // compare equal any _SIDD_CMP_RANGES // compare ranges _SIDD_CMP_EQUAL_EACH // compare equal each _SIDD_CMP_EQUAL_ORDERED // compare equal ordered _SIDD_NEGATIVE_POLARITY // negate results _SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string _SIDD_LEAST_SIGNIFICANT // index only: return last significant bit _SIDD_MOST_SIGNIFICANT // index only: return most significant bit _SIDD_BIT_MASK // mask only: return bit mask _SIDD_UNIT_MASK // mask only: return byte/word mask

Operation

size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 // compare all characters aInvalid := 0 bInvalid := 0 FOR i := 0 to UpperBound m := i*size FOR j := 0 to UpperBound n := j*size BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n]) // invalidate characters after EOS IF a[m+size-1:m] == 0 aInvalid := 1 FI IF b[n+size-1:n] == 0 bInvalid := 1 FI // override comparisons for invalid characters CASE (imm8[3:2]) OF 0: // equal any IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 1: // ranges IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 2: // equal each IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI 3: // equal ordered IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 1 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI ESAC ENDFOR ENDFOR // aggregate results CASE (imm8[3:2]) OF 0: // equal any IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound IntRes1[i] := IntRes1[i] OR BoolRes[i][j] ENDFOR ENDFOR 1: // ranges IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound, j += 2 IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1]) ENDFOR ENDFOR 2: // equal each IntRes1 := 0 FOR i := 0 to UpperBound IntRes1[i] := BoolRes[i][i] ENDFOR 3: // equal ordered IntRes1 := (imm8[0] ? 
0xFF : 0xFFFF) FOR i := 0 to UpperBound k := i FOR j := 0 to UpperBound-i IntRes1[i] := IntRes1[i] AND BoolRes[k][j] k++ ENDFOR ENDFOR ESAC // optionally negate results bInvalid := 0 FOR i := 0 to UpperBound IF imm8[4] IF imm8[5] // only negate valid IF b[n+size-1:n] == 0 bInvalid := 1 FI IF bInvalid // invalid, don't negate IntRes2[i] := IntRes1[i] ELSE // valid, negate IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // negate all IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // don't negate IntRes2[i] := IntRes1[i] FI ENDFOR // output dst := (IntRes2 != 0)

Performance

ArchitectureLatencyThroughput
Haswell113
Ivy Bridge113
Sandy Bridge113
Westmere72
Nehalem72
pcmpistri
int _mm_cmpistri (__m128i a, __m128i b, const int imm8)

Synopsis

int _mm_cmpistri (__m128i a, __m128i b, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpistri xmm, xmm, imm
CPUID Flags: SSE4.2

Description

Compare packed strings with implicit lengths in a and b using the control in imm8, and store the generated index in dst.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters _SIDD_UWORD_OPS // unsigned 16-bit characters _SIDD_SBYTE_OPS // signed 8-bit characters _SIDD_SWORD_OPS // signed 16-bit characters _SIDD_CMP_EQUAL_ANY // compare equal any _SIDD_CMP_RANGES // compare ranges _SIDD_CMP_EQUAL_EACH // compare equal each _SIDD_CMP_EQUAL_ORDERED // compare equal ordered _SIDD_NEGATIVE_POLARITY // negate results _SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string _SIDD_LEAST_SIGNIFICANT // index only: return last significant bit _SIDD_MOST_SIGNIFICANT // index only: return most significant bit _SIDD_BIT_MASK // mask only: return bit mask _SIDD_UNIT_MASK // mask only: return byte/word mask

Operation

size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 // compare all characters aInvalid := 0 bInvalid := 0 FOR i := 0 to UpperBound m := i*size FOR j := 0 to UpperBound n := j*size BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n]) // invalidate characters after EOS IF a[m+size-1:m] == 0 aInvalid := 1 FI IF b[n+size-1:n] == 0 bInvalid := 1 FI // override comparisons for invalid characters CASE (imm8[3:2]) OF 0: // equal any IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 1: // ranges IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 2: // equal each IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI 3: // equal ordered IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 1 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI ESAC ENDFOR ENDFOR // aggregate results CASE (imm8[3:2]) OF 0: // equal any IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound IntRes1[i] := IntRes1[i] OR BoolRes[i][j] ENDFOR ENDFOR 1: // ranges IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound, j += 2 IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1]) ENDFOR ENDFOR 2: // equal each IntRes1 := 0 FOR i := 0 to UpperBound IntRes1[i] := BoolRes[i][i] ENDFOR 3: // equal ordered IntRes1 := (imm8[0] ? 
0xFF : 0xFFFF) FOR i := 0 to UpperBound k := i FOR j := 0 to UpperBound-i IntRes1[i] := IntRes1[i] AND BoolRes[k][j] k++ ENDFOR ENDFOR ESAC // optionally negate results bInvalid := 0 FOR i := 0 to UpperBound IF imm8[4] IF imm8[5] // only negate valid IF b[n+size-1:n] == 0 bInvalid := 1 FI IF bInvalid // invalid, don't negate IntRes2[i] := IntRes1[i] ELSE // valid, negate IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // negate all IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // don't negate IntRes2[i] := IntRes1[i] FI ENDFOR // output IF imm8[6] // most significant bit tmp := UpperBound dst := tmp DO WHILE ((tmp >= 0) AND IntRes2[tmp] = 0) tmp := tmp - 1 dst := tmp OD ELSE // least significant bit tmp := 0 dst := tmp DO WHILE ((tmp <= UpperBound) AND IntRes2[tmp] = 0) tmp := tmp + 1 dst := tmp OD FI

Performance

ArchitectureLatencyThroughput
Haswell113
Ivy Bridge113
Sandy Bridge113
Westmere72
Nehalem72
pcmpistrm
__m128i _mm_cmpistrm (__m128i a, __m128i b, const int imm8)

Synopsis

__m128i _mm_cmpistrm (__m128i a, __m128i b, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpistrm xmm, xmm, imm
CPUID Flags: SSE4.2

Description

Compare packed strings with implicit lengths in a and b using the control in imm8, and store the generated mask in dst.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters _SIDD_UWORD_OPS // unsigned 16-bit characters _SIDD_SBYTE_OPS // signed 8-bit characters _SIDD_SWORD_OPS // signed 16-bit characters _SIDD_CMP_EQUAL_ANY // compare equal any _SIDD_CMP_RANGES // compare ranges _SIDD_CMP_EQUAL_EACH // compare equal each _SIDD_CMP_EQUAL_ORDERED // compare equal ordered _SIDD_NEGATIVE_POLARITY // negate results _SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string _SIDD_LEAST_SIGNIFICANT // index only: return last significant bit _SIDD_MOST_SIGNIFICANT // index only: return most significant bit _SIDD_BIT_MASK // mask only: return bit mask _SIDD_UNIT_MASK // mask only: return byte/word mask

Operation

size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 // compare all characters aInvalid := 0 bInvalid := 0 FOR i := 0 to UpperBound m := i*size FOR j := 0 to UpperBound n := j*size BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n]) // invalidate characters after EOS IF a[m+size-1:m] == 0 aInvalid := 1 FI IF b[n+size-1:n] == 0 bInvalid := 1 FI // override comparisons for invalid characters CASE (imm8[3:2]) OF 0: // equal any IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 1: // ranges IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 2: // equal each IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI 3: // equal ordered IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 1 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI ESAC ENDFOR ENDFOR // aggregate results CASE (imm8[3:2]) OF 0: // equal any IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound IntRes1[i] := IntRes1[i] OR BoolRes[i][j] ENDFOR ENDFOR 1: // ranges IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound, j += 2 IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1]) ENDFOR ENDFOR 2: // equal each IntRes1 := 0 FOR i := 0 to UpperBound IntRes1[i] := BoolRes[i][i] ENDFOR 3: // equal ordered IntRes1 := (imm8[0] ? 
0xFF : 0xFFFF) FOR i := 0 to UpperBound k := i FOR j := 0 to UpperBound-i IntRes1[i] := IntRes1[i] AND BoolRes[k][j] k++ ENDFOR ENDFOR ESAC // optionally negate results bInvalid := 0 FOR i := 0 to UpperBound IF imm8[4] IF imm8[5] // only negate valid IF b[n+size-1:n] == 0 bInvalid := 1 FI IF bInvalid // invalid, don't negate IntRes2[i] := IntRes1[i] ELSE // valid, negate IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // negate all IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // don't negate IntRes2[i] := IntRes1[i] FI ENDFOR // output IF imm8[6] // byte / word mask FOR i := 0 to UpperBound j := i*size IF IntRes2[i] dst[j+size-1:j] := (imm8[0] ? 0xFF : 0xFFFF) ELSE dst[j+size-1:j] := 0 FI ENDFOR ELSE // bit mask dst[UpperBound:0] := IntRes2[UpperBound:0] dst[127:UpperBound+1] := 0 FI

Performance

ArchitectureLatencyThroughput
Haswell113
Ivy Bridge113
Sandy Bridge113
Westmere82
Nehalem82
pcmpistri
int _mm_cmpistro (__m128i a, __m128i b, const int imm8)

Synopsis

int _mm_cmpistro (__m128i a, __m128i b, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpistri xmm, xmm, imm
CPUID Flags: SSE4.2

Description

Compare packed strings with implicit lengths in a and b using the control in imm8, and returns bit 0 of the resulting bit mask.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters _SIDD_UWORD_OPS // unsigned 16-bit characters _SIDD_SBYTE_OPS // signed 8-bit characters _SIDD_SWORD_OPS // signed 16-bit characters _SIDD_CMP_EQUAL_ANY // compare equal any _SIDD_CMP_RANGES // compare ranges _SIDD_CMP_EQUAL_EACH // compare equal each _SIDD_CMP_EQUAL_ORDERED // compare equal ordered _SIDD_NEGATIVE_POLARITY // negate results _SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string _SIDD_LEAST_SIGNIFICANT // index only: return last significant bit _SIDD_MOST_SIGNIFICANT // index only: return most significant bit _SIDD_BIT_MASK // mask only: return bit mask _SIDD_UNIT_MASK // mask only: return byte/word mask

Operation

size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 // compare all characters aInvalid := 0 bInvalid := 0 FOR i := 0 to UpperBound m := i*size FOR j := 0 to UpperBound n := j*size BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n]) // invalidate characters after EOS IF a[m+size-1:m] == 0 aInvalid := 1 FI IF b[n+size-1:n] == 0 bInvalid := 1 FI // override comparisons for invalid characters CASE (imm8[3:2]) OF 0: // equal any IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 1: // ranges IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 0 FI 2: // equal each IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 0 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI 3: // equal ordered IF (!aInvalid && bInvalid) BoolRes[i][j] := 0 ELSE IF (aInvalid && !bInvalid) BoolRes[i][j] := 1 ELSE If (aInvalid && bInvalid) BoolRes[i][j] := 1 FI ESAC ENDFOR ENDFOR // aggregate results CASE (imm8[3:2]) OF 0: // equal any IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound IntRes1[i] := IntRes1[i] OR BoolRes[i][j] ENDFOR ENDFOR 1: // ranges IntRes1 := 0 FOR i := 0 to UpperBound FOR j := 0 to UpperBound, j += 2 IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1]) ENDFOR ENDFOR 2: // equal each IntRes1 := 0 FOR i := 0 to UpperBound IntRes1[i] := BoolRes[i][i] ENDFOR 3: // equal ordered IntRes1 := (imm8[0] ? 
0xFF : 0xFFFF) FOR i := 0 to UpperBound k := i FOR j := 0 to UpperBound-i IntRes1[i] := IntRes1[i] AND BoolRes[k][j] k++ ENDFOR ENDFOR ESAC // optionally negate results bInvalid := 0 FOR i := 0 to UpperBound IF imm8[4] IF imm8[5] // only negate valid IF b[n+size-1:n] == 0 bInvalid := 1 FI IF bInvalid // invalid, don't negate IntRes2[i] := IntRes1[i] ELSE // valid, negate IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // negate all IntRes2[i] := -1 XOR IntRes1[i] FI ELSE // don't negate IntRes2[i] := IntRes1[i] FI ENDFOR // output dst := IntRes2[0]

Performance

ArchitectureLatencyThroughput
Haswell113
Ivy Bridge113
Sandy Bridge113
Westmere72
Nehalem72
pcmpistri
int _mm_cmpistrs (__m128i a, __m128i b, const int imm8)

Synopsis

int _mm_cmpistrs (__m128i a, __m128i b, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpistri xmm, xmm, imm
CPUID Flags: SSE4.2

Description

Compare packed strings with implicit lengths in a and b using the control in imm8, and returns 1 if any character in a was null, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters _SIDD_UWORD_OPS // unsigned 16-bit characters _SIDD_SBYTE_OPS // signed 8-bit characters _SIDD_SWORD_OPS // signed 16-bit characters _SIDD_CMP_EQUAL_ANY // compare equal any _SIDD_CMP_RANGES // compare ranges _SIDD_CMP_EQUAL_EACH // compare equal each _SIDD_CMP_EQUAL_ORDERED // compare equal ordered _SIDD_NEGATIVE_POLARITY // negate results _SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string _SIDD_LEAST_SIGNIFICANT // index only: return last significant bit _SIDD_MOST_SIGNIFICANT // index only: return most significant bit _SIDD_BIT_MASK // mask only: return bit mask _SIDD_UNIT_MASK // mask only: return byte/word mask

Operation

size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 aInvalid := 0 FOR i := 0 to UpperBound m := i*size IF a[m+size-1:m] == 0 aInvalid := 1 FI ENDFOR dst := aInvalid

Performance

ArchitectureLatencyThroughput
Haswell113
Ivy Bridge113
Sandy Bridge113
Westmere72
Nehalem72
pcmpistri
int _mm_cmpistrz (__m128i a, __m128i b, const int imm8)

Synopsis

int _mm_cmpistrz (__m128i a, __m128i b, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpistri xmm, xmm, imm
CPUID Flags: SSE4.2

Description

Compare packed strings with implicit lengths in a and b using the control in imm8, and returns 1 if any character in b was null, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters _SIDD_UWORD_OPS // unsigned 16-bit characters _SIDD_SBYTE_OPS // signed 8-bit characters _SIDD_SWORD_OPS // signed 16-bit characters _SIDD_CMP_EQUAL_ANY // compare equal any _SIDD_CMP_RANGES // compare ranges _SIDD_CMP_EQUAL_EACH // compare equal each _SIDD_CMP_EQUAL_ORDERED // compare equal ordered _SIDD_NEGATIVE_POLARITY // negate results _SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string _SIDD_LEAST_SIGNIFICANT // index only: return last significant bit _SIDD_MOST_SIGNIFICANT // index only: return most significant bit _SIDD_BIT_MASK // mask only: return bit mask _SIDD_UNIT_MASK // mask only: return byte/word mask

Operation

size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 bInvalid := 0 FOR j := 0 to UpperBound n := j*size IF b[n+size-1:n] == 0 bInvalid := 1 FI ENDFOR dst := bInvalid

Performance

ArchitectureLatencyThroughput
Haswell113
Ivy Bridge113
Sandy Bridge113
Westmere72
Nehalem72
vpcmpw
__mmask8 _mm_cmple_epi16_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmple_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpw
__mmask8 _mm_mask_cmple_epi16_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmple_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpw
__mmask16 _mm256_cmple_epi16_mask (__m256i a, __m256i b)

Synopsis

__mmask16 _mm256_cmple_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpw
__mmask16 _mm256_mask_cmple_epi16_mask (__mmask16 k1, __m256i a, __m256i b)

Synopsis

__mmask16 _mm256_mask_cmple_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpw
__mmask32 _mm512_cmple_epi16_mask (__m512i a, __m512i b)

Synopsis

__mmask32 _mm512_cmple_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpw
__mmask32 _mm512_mask_cmple_epi16_mask (__mmask32 k1, __m512i a, __m512i b)

Synopsis

__mmask32 _mm512_mask_cmple_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpd
__mmask8 _mm_cmple_epi32_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmple_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*32 k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpd
__mmask8 _mm_mask_cmple_epi32_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmple_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpd
__mmask8 _mm256_cmple_epi32_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmple_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*32 k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpd
__mmask8 _mm256_mask_cmple_epi32_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmple_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpd
__mmask16 _mm512_cmple_epi32_mask (__m512i a, __m512i b)

Synopsis

__mmask16 _mm512_cmple_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpd
__mmask16 _mm512_mask_cmple_epi32_mask (__mmask16 k1, __m512i a, __m512i b)

Synopsis

__mmask16 _mm512_mask_cmple_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpq
__mmask8 _mm_cmple_epi64_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmple_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 1 i := j*64 k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:2] := 0
vpcmpq
__mmask8 _mm_mask_cmple_epi64_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmple_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpcmpq
__mmask8 _mm256_cmple_epi64_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmple_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*64 k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpq
__mmask8 _mm256_mask_cmple_epi64_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmple_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpq
__mmask8 _mm512_cmple_epi64_mask (__m512i a, __m512i b)

Synopsis

__mmask8 _mm512_cmple_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpq
__mmask8 _mm512_mask_cmple_epi64_mask (__mmask8 k1, __m512i a, __m512i b)

Synopsis

__mmask8 _mm512_mask_cmple_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpb
__mmask16 _mm_cmple_epi8_mask (__m128i a, __m128i b)

Synopsis

__mmask16 _mm_cmple_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpb
__mmask16 _mm_mask_cmple_epi8_mask (__mmask16 k1, __m128i a, __m128i b)

Synopsis

__mmask16 _mm_mask_cmple_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpb
__mmask32 _mm256_cmple_epi8_mask (__m256i a, __m256i b)

Synopsis

__mmask32 _mm256_cmple_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpb
__mmask32 _mm256_mask_cmple_epi8_mask (__mmask32 k1, __m256i a, __m256i b)

Synopsis

__mmask32 _mm256_mask_cmple_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpb
__mmask64 _mm512_cmple_epi8_mask (__m512i a, __m512i b)

Synopsis

__mmask64 _mm512_cmple_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
vpcmpb
__mmask64 _mm512_mask_cmple_epi8_mask (__mmask64 k1, __m512i a, __m512i b)

Synopsis

__mmask64 _mm512_mask_cmple_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
vpcmpuw
__mmask8 _mm_cmple_epu16_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmple_epu16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpuw
__mmask8 _mm_mask_cmple_epu16_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmple_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpuw
__mmask16 _mm256_cmple_epu16_mask (__m256i a, __m256i b)

Synopsis

__mmask16 _mm256_cmple_epu16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpuw
__mmask16 _mm256_mask_cmple_epu16_mask (__mmask16 k1, __m256i a, __m256i b)

Synopsis

__mmask16 _mm256_mask_cmple_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpuw
__mmask32 _mm512_cmple_epu16_mask (__m512i a, __m512i b)

Synopsis

__mmask32 _mm512_cmple_epu16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpuw
__mmask32 _mm512_mask_cmple_epu16_mask (__mmask32 k1, __m512i a, __m512i b)

Synopsis

__mmask32 _mm512_mask_cmple_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpud
__mmask8 _mm_cmple_epu32_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmple_epu32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*32 k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpud
__mmask8 _mm_mask_cmple_epu32_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmple_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpud
__mmask8 _mm256_cmple_epu32_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmple_epu32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*32 k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpud
__mmask8 _mm256_mask_cmple_epu32_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmple_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpud
__mmask16 _mm512_cmple_epu32_mask (__m512i a, __m512i b)

Synopsis

__mmask16 _mm512_cmple_epu32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpud
__mmask16 _mm512_mask_cmple_epu32_mask (__mmask16 k1, __m512i a, __m512i b)

Synopsis

__mmask16 _mm512_mask_cmple_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpuq
__mmask8 _mm_cmple_epu64_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmple_epu64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 1 i := j*64 k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:2] := 0
vpcmpuq
__mmask8 _mm_mask_cmple_epu64_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmple_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpcmpuq
__mmask8 _mm256_cmple_epu64_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmple_epu64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*64 k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpuq
__mmask8 _mm256_mask_cmple_epu64_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmple_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpuq
__mmask8 _mm512_cmple_epu64_mask (__m512i a, __m512i b)

Synopsis

__mmask8 _mm512_cmple_epu64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpuq
__mmask8 _mm512_mask_cmple_epu64_mask (__mmask8 k1, __m512i a, __m512i b)

Synopsis

__mmask8 _mm512_mask_cmple_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpub
__mmask16 _mm_cmple_epu8_mask (__m128i a, __m128i b)

Synopsis

__mmask16 _mm_cmple_epu8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpub
__mmask16 _mm_mask_cmple_epu8_mask (__mmask16 k1, __m128i a, __m128i b)

Synopsis

__mmask16 _mm_mask_cmple_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpub
__mmask32 _mm256_cmple_epu8_mask (__m256i a, __m256i b)

Synopsis

__mmask32 _mm256_cmple_epu8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpub
__mmask32 _mm256_mask_cmple_epu8_mask (__mmask32 k1, __m256i a, __m256i b)

Synopsis

__mmask32 _mm256_mask_cmple_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpub
__mmask64 _mm512_cmple_epu8_mask (__m512i a, __m512i b)

Synopsis

__mmask64 _mm512_cmple_epu8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
vpcmpub
__mmask64 _mm512_mask_cmple_epu8_mask (__mmask64 k1, __m512i a, __m512i b)

Synopsis

__mmask64 _mm512_mask_cmple_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
cmppd
__m128d _mm_cmple_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmple_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := (a[i+63:i] <= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vcmppd
__mmask8 _mm512_cmple_pd_mask (__m512d a, __m512d b)

Synopsis

__mmask8 _mm512_cmple_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0 ENDFOR k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmple_pd_mask (__mmask8 k1, __m512d a, __m512d b)

Synopsis

__mmask8 _mm512_mask_cmple_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
cmpps
__m128 _mm_cmple_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmple_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ( a[i+31:i] <= b[i+31:i] ) ? 0xffffffff : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
vcmpps
__mmask16 _mm512_cmple_ps_mask (__m512 a, __m512 b)

Synopsis

__mmask16 _mm512_cmple_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0 ENDFOR k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmple_ps_mask (__mmask16 k1, __m512 a, __m512 b)

Synopsis

__mmask16 _mm512_mask_cmple_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
cmpsd
__m128d _mm_cmple_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmple_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := (a[63:0] <= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
cmpss
__m128 _mm_cmple_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmple_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := ( a[31:0] <= b[31:0] ) ? 0xffffffff : 0 dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
pcmpgtw
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_cmplt_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpgtw xmm, xmm
CPUID Flags: SSE2

Description

Compare packed 16-bit integers in a and b for less-than, and store the results in dst. Note: This intrinsic emits the pcmpgtw instruction with the order of the operands switched.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := ( a[i+15:i] < b[i+15:i] ) ? 0xFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpcmpw
__mmask8 _mm_cmplt_epi16_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmplt_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpw
__mmask8 _mm_mask_cmplt_epi16_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmplt_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpw
__mmask16 _mm256_cmplt_epi16_mask (__m256i a, __m256i b)

Synopsis

__mmask16 _mm256_cmplt_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpw
__mmask16 _mm256_mask_cmplt_epi16_mask (__mmask16 k1, __m256i a, __m256i b)

Synopsis

__mmask16 _mm256_mask_cmplt_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpw
__mmask32 _mm512_cmplt_epi16_mask (__m512i a, __m512i b)

Synopsis

__mmask32 _mm512_cmplt_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpw
__mmask32 _mm512_mask_cmplt_epi16_mask (__mmask32 k1, __m512i a, __m512i b)

Synopsis

__mmask32 _mm512_mask_cmplt_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
pcmpgtd
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_cmplt_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpgtd xmm, xmm
CPUID Flags: SSE2

Description

Compare packed 32-bit integers in a and b for less-than, and store the results in dst. Note: This intrinsic emits the pcmpgtd instruction with the order of the operands switched.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpcmpd
__mmask8 _mm_cmplt_epi32_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmplt_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*32 k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpd
__mmask8 _mm_mask_cmplt_epi32_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmplt_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpd
__mmask8 _mm256_cmplt_epi32_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmplt_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*32 k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpd
__mmask8 _mm256_mask_cmplt_epi32_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmplt_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpltd
__mmask16 _mm512_cmplt_epi32_mask (__m512i a, __m512i b)

Synopsis

__mmask16 _mm512_cmplt_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpltd k {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpd
__mmask16 _mm512_cmplt_epi32_mask (__m512i a, __m512i b)

Synopsis

__mmask16 _mm512_cmplt_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpd
__mmask16 _mm512_mask_cmplt_epi32_mask (__mmask16 k1, __m512i a, __m512i b)

Synopsis

__mmask16 _mm512_mask_cmplt_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpltd
__mmask16 _mm512_mask_cmplt_epi32_mask (__mmask16 k1, __m512i a, __m512i b)

Synopsis

__mmask16 _mm512_mask_cmplt_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpltd k {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpq
__mmask8 _mm_cmplt_epi64_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmplt_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 1 i := j*64 k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:2] := 0
vpcmpq
__mmask8 _mm_mask_cmplt_epi64_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmplt_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpcmpq
__mmask8 _mm256_cmplt_epi64_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmplt_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*64 k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpq
__mmask8 _mm256_mask_cmplt_epi64_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmplt_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpq
__mmask8 _mm512_cmplt_epi64_mask (__m512i a, __m512i b)

Synopsis

__mmask8 _mm512_cmplt_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpq
__mmask8 _mm512_mask_cmplt_epi64_mask (__mmask8 k1, __m512i a, __m512i b)

Synopsis

__mmask8 _mm512_mask_cmplt_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
pcmpgtb
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_cmplt_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpgtb xmm, xmm
CPUID Flags: SSE2

Description

Compare packed 8-bit integers in a and b for less-than, and store the results in dst. Note: This intrinsic emits the pcmpgtb instruction with the order of the operands switched.

Operation

FOR j := 0 to 15 i := j*8 dst[i+7:i] := ( a[i+7:i] < b[i+7:i] ) ? 0xFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpcmpb
__mmask16 _mm_cmplt_epi8_mask (__m128i a, __m128i b)

Synopsis

__mmask16 _mm_cmplt_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpb
__mmask16 _mm_mask_cmplt_epi8_mask (__mmask16 k1, __m128i a, __m128i b)

Synopsis

__mmask16 _mm_mask_cmplt_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpb
__mmask32 _mm256_cmplt_epi8_mask (__m256i a, __m256i b)

Synopsis

__mmask32 _mm256_cmplt_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpb
__mmask32 _mm256_mask_cmplt_epi8_mask (__mmask32 k1, __m256i a, __m256i b)

Synopsis

__mmask32 _mm256_mask_cmplt_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpb
__mmask64 _mm512_cmplt_epi8_mask (__m512i a, __m512i b)

Synopsis

__mmask64 _mm512_cmplt_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
vpcmpb
__mmask64 _mm512_mask_cmplt_epi8_mask (__mmask64 k1, __m512i a, __m512i b)

Synopsis

__mmask64 _mm512_mask_cmplt_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
vpcmpuw
__mmask8 _mm_cmplt_epu16_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmplt_epu16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpuw
__mmask8 _mm_mask_cmplt_epu16_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmplt_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpuw
__mmask16 _mm256_cmplt_epu16_mask (__m256i a, __m256i b)

Synopsis

__mmask16 _mm256_cmplt_epu16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpuw
__mmask16 _mm256_mask_cmplt_epu16_mask (__mmask16 k1, __m256i a, __m256i b)

Synopsis

__mmask16 _mm256_mask_cmplt_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpuw
__mmask32 _mm512_cmplt_epu16_mask (__m512i a, __m512i b)

Synopsis

__mmask32 _mm512_cmplt_epu16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpuw
__mmask32 _mm512_mask_cmplt_epu16_mask (__mmask32 k1, __m512i a, __m512i b)

Synopsis

__mmask32 _mm512_mask_cmplt_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpud
__mmask8 _mm_cmplt_epu32_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmplt_epu32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*32 k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpud
__mmask8 _mm_mask_cmplt_epu32_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmplt_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpud
__mmask8 _mm256_cmplt_epu32_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmplt_epu32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*32 k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpud
__mmask8 _mm256_mask_cmplt_epu32_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmplt_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpud
__mmask16 _mm512_cmplt_epu32_mask (__m512i a, __m512i b)

Synopsis

__mmask16 _mm512_cmplt_epu32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpud
__mmask16 _mm512_mask_cmplt_epu32_mask (__mmask16 k1, __m512i a, __m512i b)

Synopsis

__mmask16 _mm512_mask_cmplt_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpuq
__mmask8 _mm_cmplt_epu64_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmplt_epu64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 1 i := j*64 k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:2] := 0
vpcmpuq
__mmask8 _mm_mask_cmplt_epu64_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmplt_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpcmpuq
__mmask8 _mm256_cmplt_epu64_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmplt_epu64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*64 k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpuq
__mmask8 _mm256_mask_cmplt_epu64_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmplt_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpuq
__mmask8 _mm512_cmplt_epu64_mask (__m512i a, __m512i b)

Synopsis

__mmask8 _mm512_cmplt_epu64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpuq
__mmask8 _mm512_mask_cmplt_epu64_mask (__mmask8 k1, __m512i a, __m512i b)

Synopsis

__mmask8 _mm512_mask_cmplt_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpub
__mmask16 _mm_cmplt_epu8_mask (__m128i a, __m128i b)

Synopsis

__mmask16 _mm_cmplt_epu8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpub
__mmask16 _mm_mask_cmplt_epu8_mask (__mmask16 k1, __m128i a, __m128i b)

Synopsis

__mmask16 _mm_mask_cmplt_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpub
__mmask32 _mm256_cmplt_epu8_mask (__m256i a, __m256i b)

Synopsis

__mmask32 _mm256_cmplt_epu8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpub
__mmask32 _mm256_mask_cmplt_epu8_mask (__mmask32 k1, __m256i a, __m256i b)

Synopsis

__mmask32 _mm256_mask_cmplt_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpub
__mmask64 _mm512_cmplt_epu8_mask (__m512i a, __m512i b)

Synopsis

__mmask64 _mm512_cmplt_epu8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
vpcmpub
__mmask64 _mm512_mask_cmplt_epu8_mask (__mmask64 k1, __m512i a, __m512i b)

Synopsis

__mmask64 _mm512_mask_cmplt_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
cmppd
__m128d _mm_cmplt_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmplt_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := (a[i+63:i] < b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vcmppd
__mmask8 _mm512_cmplt_pd_mask (__m512d a, __m512d b)

Synopsis

__mmask8 _mm512_cmplt_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0 ENDFOR k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmplt_pd_mask (__mmask8 k1, __m512d a, __m512d b)

Synopsis

__mmask8 _mm512_mask_cmplt_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
cmpps
__m128 _mm_cmplt_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmplt_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xffffffff : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
vcmpps
__mmask16 _mm512_cmplt_ps_mask (__m512 a, __m512 b)

Synopsis

__mmask16 _mm512_cmplt_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0 ENDFOR k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmplt_ps_mask (__mmask16 k1, __m512 a, __m512 b)

Synopsis

__mmask16 _mm512_mask_cmplt_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
cmpsd
__m128d _mm_cmplt_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmplt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b for less-than, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := (a[63:0] < b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
cmpss
__m128 _mm_cmplt_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmplt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b for less-than, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := ( a[31:0] < b[31:0] ) ? 0xffffffff : 0 dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
vpcmpw
__mmask8 _mm_cmpneq_epi16_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpneq_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpw
__mmask8 _mm_mask_cmpneq_epi16_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpneq_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpw
__mmask16 _mm256_cmpneq_epi16_mask (__m256i a, __m256i b)

Synopsis

__mmask16 _mm256_cmpneq_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpw
__mmask16 _mm256_mask_cmpneq_epi16_mask (__mmask16 k1, __m256i a, __m256i b)

Synopsis

__mmask16 _mm256_mask_cmpneq_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpw
__mmask32 _mm512_cmpneq_epi16_mask (__m512i a, __m512i b)

Synopsis

__mmask32 _mm512_cmpneq_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpw
__mmask32 _mm512_mask_cmpneq_epi16_mask (__mmask32 k1, __m512i a, __m512i b)

Synopsis

__mmask32 _mm512_mask_cmpneq_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpd
__mmask8 _mm_cmpneq_epi32_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpneq_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*32 k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpd
__mmask8 _mm_mask_cmpneq_epi32_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpneq_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpd
__mmask8 _mm256_cmpneq_epi32_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpneq_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*32 k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpd
__mmask8 _mm256_mask_cmpneq_epi32_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpneq_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpd
__mmask16 _mm512_cmpneq_epi32_mask (__m512i a, __m512i b)

Synopsis

__mmask16 _mm512_cmpneq_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpd
__mmask16 _mm512_mask_cmpneq_epi32_mask (__mmask16 k1, __m512i a, __m512i b)

Synopsis

__mmask16 _mm512_mask_cmpneq_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpq
__mmask8 _mm_cmpneq_epi64_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpneq_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 1 i := j*64 k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:2] := 0
vpcmpq
__mmask8 _mm_mask_cmpneq_epi64_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpneq_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpcmpq
__mmask8 _mm256_cmpneq_epi64_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpneq_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*64 k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpq
__mmask8 _mm256_mask_cmpneq_epi64_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpneq_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpq
__mmask8 _mm512_cmpneq_epi64_mask (__m512i a, __m512i b)

Synopsis

__mmask8 _mm512_cmpneq_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpq
__mmask8 _mm512_mask_cmpneq_epi64_mask (__mmask8 k1, __m512i a, __m512i b)

Synopsis

__mmask8 _mm512_mask_cmpneq_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpb
__mmask16 _mm_cmpneq_epi8_mask (__m128i a, __m128i b)

Synopsis

__mmask16 _mm_cmpneq_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpb
__mmask16 _mm_mask_cmpneq_epi8_mask (__mmask16 k1, __m128i a, __m128i b)

Synopsis

__mmask16 _mm_mask_cmpneq_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpb
__mmask32 _mm256_cmpneq_epi8_mask (__m256i a, __m256i b)

Synopsis

__mmask32 _mm256_cmpneq_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpb
__mmask32 _mm256_mask_cmpneq_epi8_mask (__mmask32 k1, __m256i a, __m256i b)

Synopsis

__mmask32 _mm256_mask_cmpneq_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpb
__mmask64 _mm512_cmpneq_epi8_mask (__m512i a, __m512i b)

Synopsis

__mmask64 _mm512_cmpneq_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
vpcmpb
__mmask64 _mm512_mask_cmpneq_epi8_mask (__mmask64 k1, __m512i a, __m512i b)

Synopsis

__mmask64 _mm512_mask_cmpneq_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
vpcmpuw
__mmask8 _mm_cmpneq_epu16_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpneq_epu16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpuw
__mmask8 _mm_mask_cmpneq_epu16_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpneq_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpuw
__mmask16 _mm256_cmpneq_epu16_mask (__m256i a, __m256i b)

Synopsis

__mmask16 _mm256_cmpneq_epu16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpuw
__mmask16 _mm256_mask_cmpneq_epu16_mask (__mmask16 k1, __m256i a, __m256i b)

Synopsis

__mmask16 _mm256_mask_cmpneq_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpuw
__mmask32 _mm512_cmpneq_epu16_mask (__m512i a, __m512i b)

Synopsis

__mmask32 _mm512_cmpneq_epu16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpuw
__mmask32 _mm512_mask_cmpneq_epu16_mask (__mmask32 k1, __m512i a, __m512i b)

Synopsis

__mmask32 _mm512_mask_cmpneq_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpud
__mmask8 _mm_cmpneq_epu32_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpneq_epu32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*32 k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpud
__mmask8 _mm_mask_cmpneq_epu32_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpneq_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpud
__mmask8 _mm256_cmpneq_epu32_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpneq_epu32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*32 k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpud
__mmask8 _mm256_mask_cmpneq_epu32_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpneq_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpud
__mmask16 _mm512_cmpneq_epu32_mask (__m512i a, __m512i b)

Synopsis

__mmask16 _mm512_cmpneq_epu32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpud
__mmask16 _mm512_mask_cmpneq_epu32_mask (__mmask16 k1, __m512i a, __m512i b)

Synopsis

__mmask16 _mm512_mask_cmpneq_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpuq
__mmask8 _mm_cmpneq_epu64_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_cmpneq_epu64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 1 i := j*64 k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:2] := 0
vpcmpuq
__mmask8 _mm_mask_cmpneq_epu64_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_cmpneq_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpcmpuq
__mmask8 _mm256_cmpneq_epu64_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_cmpneq_epu64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 3 i := j*64 k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:4] := 0
vpcmpuq
__mmask8 _mm256_mask_cmpneq_epu64_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_cmpneq_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpcmpuq
__mmask8 _mm512_cmpneq_epu64_mask (__m512i a, __m512i b)

Synopsis

__mmask8 _mm512_cmpneq_epu64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
vpcmpuq
__mmask8 _mm512_mask_cmpneq_epu64_mask (__mmask8 k1, __m512i a, __m512i b)

Synopsis

__mmask8 _mm512_mask_cmpneq_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpcmpub
__mmask16 _mm_cmpneq_epu8_mask (__m128i a, __m128i b)

Synopsis

__mmask16 _mm_cmpneq_epu8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
vpcmpub
__mmask16 _mm_mask_cmpneq_epu8_mask (__mmask16 k1, __m128i a, __m128i b)

Synopsis

__mmask16 _mm_mask_cmpneq_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpcmpub
__mmask32 _mm256_cmpneq_epu8_mask (__m256i a, __m256i b)

Synopsis

__mmask32 _mm256_cmpneq_epu8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
vpcmpub
__mmask32 _mm256_mask_cmpneq_epu8_mask (__mmask32 k1, __m256i a, __m256i b)

Synopsis

__mmask32 _mm256_mask_cmpneq_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpcmpub
__mmask64 _mm512_cmpneq_epu8_mask (__m512i a, __m512i b)

Synopsis

__mmask64 _mm512_cmpneq_epu8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
vpcmpub
__mmask64 _mm512_mask_cmpneq_epu8_mask (__mmask64 k1, __m512i a, __m512i b)

Synopsis

__mmask64 _mm512_mask_cmpneq_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
cmppd
__m128d _mm_cmpneq_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpneq_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := (a[i+63:i] != b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vcmppd
__mmask8 _mm512_cmpneq_pd_mask (__m512d a, __m512d b)

Synopsis

__mmask8 _mm512_cmpneq_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0 ENDFOR k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmpneq_pd_mask (__mmask8 k1, __m512d a, __m512d b)

Synopsis

__mmask8 _mm512_mask_cmpneq_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
cmpps
__m128 _mm_cmpneq_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpneq_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ( a[i+31:i] != b[i+31:i] ) ? 0xffffffff : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
vcmpps
__mmask16 _mm512_cmpneq_ps_mask (__m512 a, __m512 b)

Synopsis

__mmask16 _mm512_cmpneq_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0 ENDFOR k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmpneq_ps_mask (__mmask16 k1, __m512 a, __m512 b)

Synopsis

__mmask16 _mm512_mask_cmpneq_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
cmpsd
__m128d _mm_cmpneq_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpneq_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b for not-equal, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := (a[63:0] != b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
cmpss
__m128 _mm_cmpneq_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpneq_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b for not-equal, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := ( a[31:0] != b[31:0] ) ? 0xffffffff : 0 dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
cmppd
__m128d _mm_cmpnge_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpnge_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for not-greater-than-or-equal, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := !(a[i+63:i] >= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
cmpps
__m128 _mm_cmpnge_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpnge_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for not-greater-than-or-equal, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := !( a[i+31:i] >= b[i+31:i] ) ? 0xffffffff : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
cmpsd
__m128d _mm_cmpnge_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpnge_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b for not-greater-than-or-equal, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := !(a[63:0] >= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
cmpss
__m128 _mm_cmpnge_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpnge_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b for not-greater-than-or-equal, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := !( a[31:0] >= b[31:0] ) ? 0xffffffff : 0 dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
cmppd
__m128d _mm_cmpngt_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpngt_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for not-greater-than, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := !(a[i+63:i] > b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
cmpps
__m128 _mm_cmpngt_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpngt_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for not-greater-than, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := !( a[i+31:i] > b[i+31:i] ) ? 0xffffffff : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
cmpsd
__m128d _mm_cmpngt_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpngt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b for not-greater-than, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := !(a[63:0] > b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
cmpss
__m128 _mm_cmpngt_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpngt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b for not-greater-than, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := !( a[31:0] > b[31:0] ) ? 0xffffffff : 0 dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
cmppd
__m128d _mm_cmpnle_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpnle_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := !(a[i+63:i] <= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vcmppd
__mmask8 _mm512_cmpnle_pd_mask (__m512d a, __m512d b)

Synopsis

__mmask8 _mm512_cmpnle_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := !(a[i+63:i] <= b[i+63:i]) ? 1 : 0 ENDFOR k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmpnle_pd_mask (__mmask8 k1, __m512d a, __m512d b)

Synopsis

__mmask8 _mm512_mask_cmpnle_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := !(a[i+63:i] <= b[i+63:i]) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
cmpps
__m128 _mm_cmpnle_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpnle_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := !( a[i+31:i] <= b[i+31:i] ) ? 0xffffffff : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
vcmpps
__mmask16 _mm512_cmpnle_ps_mask (__m512 a, __m512 b)

Synopsis

__mmask16 _mm512_cmpnle_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := !(a[i+31:i] <= b[i+31:i]) ? 1 : 0 ENDFOR k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmpnle_ps_mask (__mmask16 k1, __m512 a, __m512 b)

Synopsis

__mmask16 _mm512_mask_cmpnle_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := !(a[i+31:i] <= b[i+31:i]) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
cmpsd
__m128d _mm_cmpnle_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpnle_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := !(a[63:0] <= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
cmpss
__m128 _mm_cmpnle_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpnle_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := !( a[31:0] <= b[31:0] ) ? 0xffffffff : 0 dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
cmppd
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpnlt_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := !(a[i+63:i] < b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vcmppd
__mmask8 _mm512_cmpnlt_pd_mask (__m512d a, __m512d b)

Synopsis

__mmask8 _mm512_cmpnlt_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := !(a[i+63:i] < b[i+63:i]) ? 1 : 0 ENDFOR k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmpnlt_pd_mask (__mmask8 k1, __m512d a, __m512d b)

Synopsis

__mmask8 _mm512_mask_cmpnlt_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := !(a[i+63:i] < b[i+63:i]) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
cmpps
__m128 _mm_cmpnlt_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpnlt_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := !( a[i+31:i] < b[i+31:i] ) ? 0xffffffff : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
vcmpps
__mmask16 _mm512_cmpnlt_ps_mask (__m512 a, __m512 b)

Synopsis

__mmask16 _mm512_cmpnlt_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := !(a[i+31:i] < b[i+31:i]) ? 1 : 0 ENDFOR k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmpnlt_ps_mask (__mmask16 k1, __m512 a, __m512 b)

Synopsis

__mmask16 _mm512_mask_cmpnlt_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := !(a[i+31:i] < b[i+31:i]) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
cmpsd
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpnlt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b for not-less-than, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := !(a[63:0] < b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
cmpss
__m128 _mm_cmpnlt_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpnlt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b for not-less-than, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := !( a[31:0] < b[31:0] ) ? 0xffffffff : 0 dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
cmppd
__m128d _mm_cmpord_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpord_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vcmppd
__mmask8 _mm512_cmpord_pd_mask (__m512d a, __m512d b)

Synopsis

__mmask8 _mm512_cmpord_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0 ENDFOR k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmpord_pd_mask (__mmask8 k1, __m512d a, __m512d b)

Synopsis

__mmask8 _mm512_mask_cmpord_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
cmpps
__m128 _mm_cmpord_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpord_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE

Description

Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ( a[i+31:i] != NaN AND b[i+31:i] != NaN ) ? 0xffffffff : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
vcmpps
__mmask16 _mm512_cmpord_ps_mask (__m512 a, __m512 b)

Synopsis

__mmask16 _mm512_cmpord_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := (a[i+31:i] != NaN AND b[i+31:i] != NaN) ? 1 : 0 ENDFOR k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmpord_ps_mask (__mmask16 k1, __m512 a, __m512 b)

Synopsis

__mmask16 _mm512_mask_cmpord_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := (a[i+31:i] != NaN AND b[i+31:i] != NaN) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
cmpsd
__m128d _mm_cmpord_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpord_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := (a[63:0] != NaN AND b[63:0] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
cmpss
__m128 _mm_cmpord_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpord_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := ( a[31:0] != NaN AND b[31:0] != NaN ) ? 0xffffffff : 0 dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
cmppd
__m128d _mm_cmpunord_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpunord_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vcmppd
__mmask8 _mm512_cmpunord_pd_mask (__m512d a, __m512d b)

Synopsis

__mmask8 _mm512_cmpunord_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k.

Operation

FOR j := 0 to 7 i := j*64 k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0 ENDFOR k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmpunord_pd_mask (__mmask8 k1, __m512d a, __m512d b)

Synopsis

__mmask8 _mm512_mask_cmpunord_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
cmpps
__m128 _mm_cmpunord_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpunord_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE

Description

Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ( a[i+31:i] == NaN OR b[i+31:i] == NaN ) ? 0xffffffff : 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
vcmpps
__mmask16 _mm512_cmpunord_ps_mask (__m512 a, __m512 b)

Synopsis

__mmask16 _mm512_cmpunord_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k.

Operation

FOR j := 0 to 15 i := j*32 k[j] := (a[i+31:i] == NaN OR b[i+31:i] == NaN) ? 1 : 0 ENDFOR k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmpunord_ps_mask (__mmask16 k1, __m512 a, __m512 b)

Synopsis

__mmask16 _mm512_mask_cmpunord_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := (a[i+31:i] == NaN OR b[i+31:i] == NaN) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
cmpsd
__m128d _mm_cmpunord_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_cmpunord_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b to see if either is NaN, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := (a[63:0] == NaN OR b[63:0] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
cmpss
__m128 _mm_cmpunord_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_cmpunord_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b to see if either is NaN, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := ( a[31:0] == NaN OR b[31:0] == NaN ) ? 0xffffffff : 0 dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
vcomisd
int _mm_comi_round_sd (__m128d a, __m128d b, const int imm8, const int sae)

Synopsis

int _mm_comi_round_sd (__m128d a, __m128d b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcomisd xmm, xmm {sae}
CPUID Flags: AVX512F

Description

Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC RETURN ( a[63:0] OP b[63:0] ) ? 1 : 0
vcomiss
int _mm_comi_round_ss (__m128 a, __m128 b, const int imm8, const int sae)

Synopsis

int _mm_comi_round_ss (__m128 a, __m128 b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcomiss xmm, xmm {sae}
CPUID Flags: AVX512F

Description

Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC RETURN ( a[31:0] OP b[31:0] ) ? 1 : 0
comisd
int _mm_comieq_sd (__m128d a, __m128d b)

Synopsis

int _mm_comieq_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: comisd xmm, xmm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point element in a and b for equality, and return the boolean result (0 or 1).

Operation

RETURN ( a[63:0] == b[63:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell30.8
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
comiss
int _mm_comieq_ss (__m128 a, __m128 b)

Synopsis

int _mm_comieq_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: comiss xmm, xmm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point element in a and b for equality, and return the boolean result (0 or 1).

Operation

RETURN ( a[31:0] == b[31:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
comisd
int _mm_comige_sd (__m128d a, __m128d b)

Synopsis

int _mm_comige_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: comisd xmm, xmm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point element in a and b for greater-than-or-equal, and return the boolean result (0 or 1).

Operation

RETURN ( a[63:0] >= b[63:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell30.8
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
comiss
int _mm_comige_ss (__m128 a, __m128 b)

Synopsis

int _mm_comige_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: comiss xmm, xmm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point element in a and b for greater-than-or-equal, and return the boolean result (0 or 1).

Operation

RETURN ( a[31:0] >= b[31:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
comisd
int _mm_comigt_sd (__m128d a, __m128d b)

Synopsis

int _mm_comigt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: comisd xmm, xmm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point element in a and b for greater-than, and return the boolean result (0 or 1).

Operation

RETURN ( a[63:0] > b[63:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell30.8
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
comiss
int _mm_comigt_ss (__m128 a, __m128 b)

Synopsis

int _mm_comigt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: comiss xmm, xmm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point element in a and b for greater-than, and return the boolean result (0 or 1).

Operation

RETURN ( a[31:0] > b[31:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
comisd
int _mm_comile_sd (__m128d a, __m128d b)

Synopsis

int _mm_comile_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: comisd xmm, xmm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point element in a and b for less-than-or-equal, and return the boolean result (0 or 1).

Operation

RETURN ( a[63:0] <= b[63:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell30.8
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
comiss
int _mm_comile_ss (__m128 a, __m128 b)

Synopsis

int _mm_comile_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: comiss xmm, xmm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point element in a and b for less-than-or-equal, and return the boolean result (0 or 1).

Operation

RETURN ( a[31:0] <= b[31:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
comisd
int _mm_comilt_sd (__m128d a, __m128d b)

Synopsis

int _mm_comilt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: comisd xmm, xmm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point element in a and b for less-than, and return the boolean result (0 or 1).

Operation

RETURN ( a[63:0] < b[63:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell30.8
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
comiss
int _mm_comilt_ss (__m128 a, __m128 b)

Synopsis

int _mm_comilt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: comiss xmm, xmm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point element in a and b for less-than, and return the boolean result (0 or 1).

Operation

RETURN ( a[31:0] < b[31:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
comisd
int _mm_comineq_sd (__m128d a, __m128d b)

Synopsis

int _mm_comineq_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: comisd xmm, xmm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point element in a and b for not-equal, and return the boolean result (0 or 1).

Operation

RETURN ( a[63:0] != b[63:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell30.8
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
comiss
int _mm_comineq_ss (__m128 a, __m128 b)

Synopsis

int _mm_comineq_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: comiss xmm, xmm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point element in a and b for not-equal, and return the boolean result (0 or 1).

Operation

RETURN ( a[31:0] != b[31:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
vpcompressd
__m128i _mm_mask_compress_epi32 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_compress_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpcompressd
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.

Operation

size := 32 m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[127:m] := src[127:m] dst[MAX:128] := 0
vpcompressd
__m128i _mm_maskz_compress_epi32 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_compress_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpcompressd
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.

Operation

size := 32 m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[127:m] := 0 dst[MAX:128] := 0
vpcompressd
__m256i _mm256_mask_compress_epi32 (__m256i src, __mmask8 k, __m256i a)

Synopsis

__m256i _mm256_mask_compress_epi32 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpcompressd
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.

Operation

size := 32 m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[255:m] := src[255:m] dst[MAX:256] := 0
vpcompressd
__m256i _mm256_maskz_compress_epi32 (__mmask8 k, __m256i a)

Synopsis

__m256i _mm256_maskz_compress_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpcompressd
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.

Operation

size := 32 m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[255:m] := 0 dst[MAX:256] := 0
vpcompressd
__m512i _mm512_mask_compress_epi32 (__m512i src, __mmask16 k, __m512i a)

Synopsis

__m512i _mm512_mask_compress_epi32 (__m512i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpcompressd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.

Operation

size := 32 m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[511:m] := src[511:m] dst[MAX:512] := 0
vpcompressd
__m512i _mm512_maskz_compress_epi32 (__mmask16 k, __m512i a)

Synopsis

__m512i _mm512_maskz_compress_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpcompressd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.

Operation

size := 32 m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[511:m] := 0 dst[MAX:512] := 0
vpcompressq
__m128i _mm_mask_compress_epi64 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_compress_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpcompressq
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.

Operation

size := 64 m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[127:m] := src[127:m] dst[MAX:128] := 0
vpcompressq
__m128i _mm_maskz_compress_epi64 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_compress_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpcompressq
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.

Operation

size := 64 m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[127:m] := 0 dst[MAX:128] := 0
vpcompressq
__m256i _mm256_mask_compress_epi64 (__m256i src, __mmask8 k, __m256i a)

Synopsis

__m256i _mm256_mask_compress_epi64 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpcompressq
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.

Operation

size := 64 m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[255:m] := src[255:m] dst[MAX:256] := 0
vpcompressq
__m256i _mm256_maskz_compress_epi64 (__mmask8 k, __m256i a)

Synopsis

__m256i _mm256_maskz_compress_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpcompressq
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.

Operation

size := 64 m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[255:m] := 0 dst[MAX:256] := 0
vpcompressq
__m512i _mm512_mask_compress_epi64 (__m512i src, __mmask8 k, __m512i a)

Synopsis

__m512i _mm512_mask_compress_epi64 (__m512i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpcompressq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.

Operation

size := 64 m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[511:m] := src[511:m] dst[MAX:512] := 0
vpcompressq
__m512i _mm512_maskz_compress_epi64 (__mmask8 k, __m512i a)

Synopsis

__m512i _mm512_maskz_compress_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpcompressq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.

Operation

size := 64 m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[511:m] := 0 dst[MAX:512] := 0
vcompresspd
__m128d _mm_mask_compress_pd (__m128d src, __mmask8 k, __m128d a)

Synopsis

__m128d _mm_mask_compress_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcompresspd
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.

Operation

size := 64 m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[127:m] := src[127:m] dst[MAX:128] := 0
vcompresspd
__m128d _mm_maskz_compress_pd (__mmask8 k, __m128d a)

Synopsis

__m128d _mm_maskz_compress_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcompresspd
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.

Operation

size := 64 m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[127:m] := 0 dst[MAX:128] := 0
vcompresspd
__m256d _mm256_mask_compress_pd (__m256d src, __mmask8 k, __m256d a)

Synopsis

__m256d _mm256_mask_compress_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcompresspd
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.

Operation

size := 64 m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[255:m] := src[255:m] dst[MAX:256] := 0
vcompresspd
__m256d _mm256_maskz_compress_pd (__mmask8 k, __m256d a)

Synopsis

__m256d _mm256_maskz_compress_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcompresspd
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.

Operation

size := 64 m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[255:m] := 0 dst[MAX:256] := 0
vcompresspd
__m512d _mm512_mask_compress_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_compress_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcompresspd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.

Operation

size := 64 m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[511:m] := src[511:m] dst[MAX:512] := 0
vcompresspd
__m512d _mm512_maskz_compress_pd (__mmask8 k, __m512d a)

Synopsis

__m512d _mm512_maskz_compress_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcompresspd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.

Operation

size := 64 m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[511:m] := 0 dst[MAX:512] := 0
vcompressps
__m128 _mm_mask_compress_ps (__m128 src, __mmask8 k, __m128 a)

Synopsis

__m128 _mm_mask_compress_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcompressps
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.

Operation

size := 32 m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[127:m] := src[127:m] dst[MAX:128] := 0
vcompressps
__m128 _mm_maskz_compress_ps (__mmask8 k, __m128 a)

Synopsis

__m128 _mm_maskz_compress_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcompressps
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.

Operation

size := 32 m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[127:m] := 0 dst[MAX:128] := 0
vcompressps
__m256 _mm256_mask_compress_ps (__m256 src, __mmask8 k, __m256 a)

Synopsis

__m256 _mm256_mask_compress_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcompressps
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.

Operation

size := 32 m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[255:m] := src[255:m] dst[MAX:256] := 0
vcompressps
__m256 _mm256_maskz_compress_ps (__mmask8 k, __m256 a)

Synopsis

__m256 _mm256_maskz_compress_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcompressps
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.

Operation

size := 32 m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[255:m] := 0 dst[MAX:256] := 0
vcompressps
__m512 _mm512_mask_compress_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_compress_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcompressps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.

Operation

size := 32 m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[511:m] := src[511:m] dst[MAX:512] := 0
vcompressps
__m512 _mm512_maskz_compress_ps (__mmask16 k, __m512 a)

Synopsis

__m512 _mm512_maskz_compress_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcompressps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.

Operation

size := 32 m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[511:m] := 0 dst[MAX:512] := 0
vpcompressd
void _mm_mask_compressstoreu_epi32 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_compressstoreu_epi32 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpcompressd
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

size := 32 m := base_addr FOR j := 0 to 3 i := j*32 IF k[j] MEM[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR
vpcompressd
void _mm256_mask_compressstoreu_epi32 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_compressstoreu_epi32 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpcompressd
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

size := 32 m := base_addr FOR j := 0 to 7 i := j*32 IF k[j] MEM[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR
vpcompressd
void _mm512_mask_compressstoreu_epi32 (void* base_addr, __mmask16 k, __m512i a)

Synopsis

void _mm512_mask_compressstoreu_epi32 (void* base_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpcompressd m32 {k}, zmm
CPUID Flags: AVX512F

Description

Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

size := 32 m := base_addr FOR j := 0 to 15 i := j*32 IF k[j] MEM[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR
vpcompressq
void _mm_mask_compressstoreu_epi64 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_compressstoreu_epi64 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpcompressq
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

size := 64 m := base_addr FOR j := 0 to 1 i := j*64 IF k[j] MEM[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR
vpcompressq
void _mm256_mask_compressstoreu_epi64 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_compressstoreu_epi64 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpcompressq
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

size := 64 m := base_addr FOR j := 0 to 3 i := j*64 IF k[j] MEM[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR
vpcompressq
void _mm512_mask_compressstoreu_epi64 (void* base_addr, __mmask8 k, __m512i a)

Synopsis

void _mm512_mask_compressstoreu_epi64 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpcompressq m64 {k}, zmm
CPUID Flags: AVX512F

Description

Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

size := 64 m := base_addr FOR j := 0 to 7 i := j*64 IF k[j] MEM[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR
vcompresspd
void _mm_mask_compressstoreu_pd (void* base_addr, __mmask8 k, __m128d a)

Synopsis

void _mm_mask_compressstoreu_pd (void* base_addr, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcompresspd
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

size := 64 m := base_addr FOR j := 0 to 1 i := j*64 IF k[j] MEM[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR
vcompresspd
void _mm256_mask_compressstoreu_pd (void* base_addr, __mmask8 k, __m256d a)

Synopsis

void _mm256_mask_compressstoreu_pd (void* base_addr, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcompresspd
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

size := 64 m := base_addr FOR j := 0 to 3 i := j*64 IF k[j] MEM[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR
vcompresspd
void _mm512_mask_compressstoreu_pd (void* base_addr, __mmask8 k, __m512d a)

Synopsis

void _mm512_mask_compressstoreu_pd (void* base_addr, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcompresspd m512 {k}, zmm
CPUID Flags: AVX512F

Description

Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

size := 64 m := base_addr FOR j := 0 to 7 i := j*64 IF k[j] MEM[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR
vcompressps
void _mm_mask_compressstoreu_ps (void* base_addr, __mmask8 k, __m128 a)

Synopsis

void _mm_mask_compressstoreu_ps (void* base_addr, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcompressps
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

size := 32 m := base_addr FOR j := 0 to 3 i := j*32 IF k[j] MEM[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR
vcompressps
void _mm256_mask_compressstoreu_ps (void* base_addr, __mmask8 k, __m256 a)

Synopsis

void _mm256_mask_compressstoreu_ps (void* base_addr, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcompressps
CPUID Flags: AVX512VL + AVX512F

Description

Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

size := 32 m := base_addr FOR j := 0 to 7 i := j*32 IF k[j] MEM[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR
vcompressps
void _mm512_mask_compressstoreu_ps (void* base_addr, __mmask16 k, __m512 a)

Synopsis

void _mm512_mask_compressstoreu_ps (void* base_addr, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcompressps m512 {k}, zmm
CPUID Flags: AVX512F

Description

Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

size := 32 m := base_addr FOR j := 0 to 15 i := j*32 IF k[j] MEM[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR
vpconflictd
__m128i _mm_conflict_epi32 (__m128i a)

Synopsis

__m128i _mm_conflict_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpconflictd
CPUID Flags: AVX512VL + AVX512CD

Description

Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 3 i := j*32 FOR k := 0 to j-1 m := k*32 dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 ENDFOR dst[i+31:i+j] := 0 ENDFOR dst[MAX:128] := 0
vpconflictd
__m128i _mm_mask_conflict_epi32 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_conflict_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpconflictd
CPUID Flags: AVX512VL + AVX512CD

Description

Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] FOR l := 0 to j-1 m := l*32 dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 ENDFOR dst[i+31:i+j] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpconflictd
__m128i _mm_maskz_conflict_epi32 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_conflict_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpconflictd
CPUID Flags: AVX512VL + AVX512CD

Description

Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] FOR l := 0 to j-1 m := l*32 dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 ENDFOR dst[i+31:i+j] := 0 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpconflictd
__m256i _mm256_conflict_epi32 (__m256i a)

Synopsis

__m256i _mm256_conflict_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vpconflictd
CPUID Flags: AVX512VL + AVX512CD

Description

Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 7 i := j*32 FOR k := 0 to j-1 m := k*32 dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 ENDFOR dst[i+31:i+j] := 0 ENDFOR dst[MAX:256] := 0
vpconflictd
__m256i _mm256_mask_conflict_epi32 (__m256i src, __mmask8 k, __m256i a)

Synopsis

__m256i _mm256_mask_conflict_epi32 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpconflictd
CPUID Flags: AVX512VL + AVX512CD

Description

Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] FOR l := 0 to j-1 m := l*32 dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 ENDFOR dst[i+31:i+j] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpconflictd
__m256i _mm256_maskz_conflict_epi32 (__mmask8 k, __m256i a)

Synopsis

__m256i _mm256_maskz_conflict_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpconflictd
CPUID Flags: AVX512VL + AVX512CD

Description

Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] FOR l := 0 to j-1 m := l*32 dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 ENDFOR dst[i+31:i+j] := 0 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpconflictd
__m512i _mm512_conflict_epi32 (__m512i a)

Synopsis

__m512i _mm512_conflict_epi32 (__m512i a)
#include "immintrin.h"
Instruction: vpconflictd zmm {k}, zmm
CPUID Flags: AVX512CD

Description

Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 15 i := j*32 FOR k := 0 to j-1 m := k*32 dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 ENDFOR dst[i+31:i+j] := 0 ENDFOR dst[MAX:512] := 0
vpconflictd
__m512i _mm512_mask_conflict_epi32 (__m512i src, __mmask16 k, __m512i a)

Synopsis

__m512i _mm512_mask_conflict_epi32 (__m512i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpconflictd zmm {k}, zmm
CPUID Flags: AVX512CD

Description

Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] FOR l := 0 to j-1 m := l*32 dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 ENDFOR dst[i+31:i+j] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpconflictd
__m512i _mm512_maskz_conflict_epi32 (__mmask16 k, __m512i a)

Synopsis

__m512i _mm512_maskz_conflict_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpconflictd zmm {k}, zmm
CPUID Flags: AVX512CD

Description

Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] FOR l := 0 to j-1 m := l*32 dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 ENDFOR dst[i+31:i+j] := 0 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpconflictq
__m128i _mm_conflict_epi64 (__m128i a)

Synopsis

__m128i _mm_conflict_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpconflictq
CPUID Flags: AVX512VL + AVX512CD

Description

Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 1 i := j*64 FOR k := 0 to j-1 m := k*64 dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 ENDFOR dst[i+63:i+j] := 0 ENDFOR dst[MAX:128] := 0
vpconflictq
__m128i _mm_mask_conflict_epi64 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_conflict_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpconflictq
CPUID Flags: AVX512VL + AVX512CD

Description

Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] FOR l := 0 to j-1 m := l*64 dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 ENDFOR dst[i+63:i+j] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpconflictq
__m128i _mm_maskz_conflict_epi64 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_conflict_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpconflictq
CPUID Flags: AVX512VL + AVX512CD

Description

Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] FOR l := 0 to j-1 m := l*64 dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 ENDFOR dst[i+63:i+j] := 0 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpconflictq
__m256i _mm256_conflict_epi64 (__m256i a)

Synopsis

__m256i _mm256_conflict_epi64 (__m256i a)
#include "immintrin.h"
Instruction: vpconflictq
CPUID Flags: AVX512VL + AVX512CD

Description

Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 3 i := j*64 FOR k := 0 to j-1 m := k*64 dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 ENDFOR dst[i+63:i+j] := 0 ENDFOR dst[MAX:256] := 0
vpconflictq
__m256i _mm256_mask_conflict_epi64 (__m256i src, __mmask8 k, __m256i a)

Synopsis

__m256i _mm256_mask_conflict_epi64 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpconflictq
CPUID Flags: AVX512VL + AVX512CD

Description

Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] FOR l := 0 to j-1 m := l*64 dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 ENDFOR dst[i+63:i+j] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpconflictq
__m256i _mm256_maskz_conflict_epi64 (__mmask8 k, __m256i a)

Synopsis

__m256i _mm256_maskz_conflict_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpconflictq
CPUID Flags: AVX512VL + AVX512CD

Description

Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] FOR l := 0 to j-1 m := l*64 dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 ENDFOR dst[i+63:i+j] := 0 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpconflictq
__m512i _mm512_conflict_epi64 (__m512i a)

Synopsis

__m512i _mm512_conflict_epi64 (__m512i a)
#include "immintrin.h"
Instruction: vpconflictq zmm {k}, zmm
CPUID Flags: AVX512CD

Description

Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 7 i := j*64 FOR k := 0 to j-1 m := k*64 dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 ENDFOR dst[i+63:i+j] := 0 ENDFOR dst[MAX:512] := 0
vpconflictq
__m512i _mm512_mask_conflict_epi64 (__m512i src, __mmask8 k, __m512i a)

Synopsis

__m512i _mm512_mask_conflict_epi64 (__m512i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpconflictq zmm {k}, zmm
CPUID Flags: AVX512CD

Description

Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] FOR l := 0 to j-1 m := l*64 dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 ENDFOR dst[i+63:i+j] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpconflictq
__m512i _mm512_maskz_conflict_epi64 (__mmask8 k, __m512i a)

Synopsis

__m512i _mm512_maskz_conflict_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpconflictq zmm {k}, zmm
CPUID Flags: AVX512CD

Description

Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] FOR l := 0 to j-1 m := l*64 dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 ENDFOR dst[i+63:i+j] := 0 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_cos_pd (__m128d a)

Synopsis

__m128d _mm_cos_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := COS(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_cos_pd (__m256d a)

Synopsis

__m256d _mm256_cos_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := COS(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_cos_pd (__m512d a)

Synopsis

__m512d _mm512_cos_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := COS(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_cos_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_cos_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := COS(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_cos_ps (__m128 a)

Synopsis

__m128 _mm_cos_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := COS(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_cos_ps (__m256 a)

Synopsis

__m256 _mm256_cos_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := COS(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_cos_ps (__m512 a)

Synopsis

__m512 _mm512_cos_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := COS(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_cos_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_cos_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := COS(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_cosd_pd (__m128d a)

Synopsis

__m128d _mm_cosd_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := COSD(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_cosd_pd (__m256d a)

Synopsis

__m256d _mm256_cosd_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := COSD(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_cosd_pd (__m512d a)

Synopsis

__m512d _mm512_cosd_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := COSD(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_cosd_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_cosd_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := COSD(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_cosd_ps (__m128 a)

Synopsis

__m128 _mm_cosd_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := COSD(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_cosd_ps (__m256 a)

Synopsis

__m256 _mm256_cosd_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := COSD(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_cosd_ps (__m512 a)

Synopsis

__m512 _mm512_cosd_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := COSD(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_cosd_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_cosd_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := COSD(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_cosh_pd (__m128d a)

Synopsis

__m128d _mm_cosh_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := COSH(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_cosh_pd (__m256d a)

Synopsis

__m256d _mm256_cosh_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := COSH(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_cosh_pd (__m512d a)

Synopsis

__m512d _mm512_cosh_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := COSH(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_cosh_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_cosh_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := COSH(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_cosh_ps (__m128 a)

Synopsis

__m128 _mm_cosh_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := COSH(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_cosh_ps (__m256 a)

Synopsis

__m256 _mm256_cosh_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := COSH(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_cosh_ps (__m512 a)

Synopsis

__m512 _mm512_cosh_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := COSH(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_cosh_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_cosh_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := COSH(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
popcnt
unsigned int _mm_countbits_32 (unsigned int r1)

Synopsis

unsigned int _mm_countbits_32 (unsigned int r1)
#include "immintrin.h"
Instruction: popcnt r32, r32
CPUID Flags: KNCNI

Description

Counts the number of set bits in 32-bit unsigned integer r1, returning the results in dst.

Operation

dst[31:0] := PopCount(r1[31:0])
popcnt
unsigned __int64 _mm_countbits_64 (unsigned __int64 r1)

Synopsis

unsigned __int64 _mm_countbits_64 (unsigned __int64 r1)
#include "immintrin.h"
Instruction: popcnt r64, r64
CPUID Flags: KNCNI

Description

Counts the number of set bits in 64-bit unsigned integer r1, returning the results in dst.

Operation

dst[63:0] := PopCount(r1[63:0])
crc32
unsigned int _mm_crc32_u16 (unsigned int crc, unsigned short v)

Synopsis

unsigned int _mm_crc32_u16 (unsigned int crc, unsigned short v)
#include "nmmintrin.h"
Instruction: crc32 r32, r16
CPUID Flags: SSE4.2

Description

Starting with the initial value in crc, accumulates a CRC32 value for unsigned 16-bit integer v, and stores the result in dst.

Operation

tmp1[15:0] := v[0:15] // bit reflection tmp2[31:0] := crc[0:31] // bit reflection tmp3[47:0] := tmp1[15:0] << 32 tmp4[47:0] := tmp2[31:0] << 16 tmp5[47:0] := tmp3[47:0] XOR tmp4[47:0] tmp6[31:0] := tmp5[47:0] MOD2 0x11EDC6F41 dst[31:0] := tmp6[0:31] // bit reflection

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
crc32
unsigned int _mm_crc32_u32 (unsigned int crc, unsigned int v)

Synopsis

unsigned int _mm_crc32_u32 (unsigned int crc, unsigned int v)
#include "nmmintrin.h"
Instruction: crc32 r32, r32
CPUID Flags: SSE4.2

Description

Starting with the initial value in crc, accumulates a CRC32 value for unsigned 32-bit integer v, and stores the result in dst.

Operation

tmp1[31:0] := v[0:31] // bit reflection tmp2[31:0] := crc[0:31] // bit reflection tmp3[63:0] := tmp1[31:0] << 32 tmp4[63:0] := tmp2[31:0] << 32 tmp5[63:0] := tmp3[63:0] XOR tmp4[63:0] tmp6[31:0] := tmp5[63:0] MOD2 0x11EDC6F41 dst[31:0] := tmp6[0:31] // bit reflection

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
crc32
unsigned __int64 _mm_crc32_u64 (unsigned __int64 crc, unsigned __int64 v)

Synopsis

unsigned __int64 _mm_crc32_u64 (unsigned __int64 crc, unsigned __int64 v)
#include "nmmintrin.h"
Instruction: crc32 r64, r64
CPUID Flags: SSE4.2

Description

Starting with the initial value in crc, accumulates a CRC32 value for unsigned 64-bit integer v, and stores the result in dst.

Operation

tmp1[63:0] := v[0:63] // bit reflection tmp2[31:0] := crc[0:31] // bit reflection tmp3[95:0] := tmp1[63:0] << 32 tmp4[95:0] := tmp2[31:0] << 64 tmp5[95:0] := tmp3[95:0] XOR tmp4[95:0] tmp6[31:0] := tmp5[95:0] MOD2 0x11EDC6F41 dst[31:0] := tmp6[0:31] // bit reflection

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
crc32
unsigned int _mm_crc32_u8 (unsigned int crc, unsigned char v)

Synopsis

unsigned int _mm_crc32_u8 (unsigned int crc, unsigned char v)
#include "nmmintrin.h"
Instruction: crc32 r32, r8
CPUID Flags: SSE4.2

Description

Starting with the initial value in crc, accumulates a CRC32 value for unsigned 8-bit integer v, and stores the result in dst.

Operation

tmp1[7:0] := v[0:7] // bit reflection tmp2[31:0] := crc[0:31] // bit reflection tmp3[39:0] := tmp1[7:0] << 32 tmp4[39:0] := tmp2[31:0] << 8 tmp5[39:0] := tmp3[39:0] XOR tmp4[39:0] tmp6[31:0] := tmp5[39:0] MOD2 0x11EDC6F41 dst[31:0] := tmp6[0:31] // bit reflection

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
...
__m128 _mm_csqrt_ps (__m128 a)

Synopsis

__m128 _mm_csqrt_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the square root of packed complex single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := SQRT(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_csqrt_ps (__m256 a)

Synopsis

__m256 _mm256_csqrt_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the square root of packed complex single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := SQRT(a[i+31:i]) ENDFOR dst[MAX:256] := 0
cvtpi2ps
__m128 _mm_cvt_pi2ps (__m128 a, __m64 b)

Synopsis

__m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
#include "xmmintrin.h"
Instruction: cvtpi2ps xmm, mm
CPUID Flags: SSE

Description

Convert packed 32-bit integers in b to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of dst, and copy the upper 2 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := Convert_Int32_To_FP32(b[31:0]) dst[63:32] := Convert_Int32_To_FP32(b[63:32]) dst[95:64] := a[95:64] dst[127:96] := a[127:96]

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
cvtps2pi
__m64 _mm_cvt_ps2pi (__m128 a)

Synopsis

__m64 _mm_cvt_ps2pi (__m128 a)
#include "xmmintrin.h"
Instruction: cvtps2pi mm, xmm
CPUID Flags: SSE

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 1 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vcvtdq2ps
__m512 _mm512_cvt_roundepi32_ps (__m512i a, int rounding)

Synopsis

__m512 _mm512_cvt_roundepi32_ps (__m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtdq2ps zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvtdq2ps
__m512 _mm512_mask_cvt_roundepi32_ps (__m512 src, __mmask16 k, __m512i a, int rounding)

Synopsis

__m512 _mm512_mask_cvt_roundepi32_ps (__m512 src, __mmask16 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtdq2ps zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vcvtdq2ps
__m512 _mm512_maskz_cvt_roundepi32_ps (__mmask16 k, __m512i a, int rounding)

Synopsis

__m512 _mm512_maskz_cvt_roundepi32_ps (__mmask16 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtdq2ps zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtqq2pd
__m512d _mm512_cvt_roundepi64_pd (__m512i a, int rounding)

Synopsis

__m512d _mm512_cvt_roundepi64_pd (__m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512DQ

Description

Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvtqq2pd
__m512d _mm512_mask_cvt_roundepi64_pd (__m512d src, __mmask8 k, __m512i a, int rounding)

Synopsis

__m512d _mm512_mask_cvt_roundepi64_pd (__m512d src, __mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512DQ

Description

Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvtqq2pd
__m512d _mm512_maskz_cvt_roundepi64_pd (__mmask8 k, __m512i a, int rounding)

Synopsis

__m512d _mm512_maskz_cvt_roundepi64_pd (__mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512DQ

Description

Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtqq2ps
__m256 _mm512_cvt_roundepi64_ps (__m512i a, int rounding)

Synopsis

__m256 _mm512_cvt_roundepi64_ps (__m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512DQ

Description

Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 l := j*32 dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ENDFOR dst[MAX:256] := 0
vcvtqq2ps
__m256 _mm512_mask_cvt_roundepi64_ps (__m256 src, __mmask8 k, __m512i a, int rounding)

Synopsis

__m256 _mm512_mask_cvt_roundepi64_ps (__m256 src, __mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512DQ

Description

Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:256] := 0
vcvtqq2ps
__m256 _mm512_maskz_cvt_roundepi64_ps (__mmask8 k, __m512i a, int rounding)

Synopsis

__m256 _mm512_maskz_cvt_roundepi64_ps (__mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512DQ

Description

Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtudq2ps
__m512 _mm512_cvt_roundepu32_ps (__m512i a, int rounding)

Synopsis

__m512 _mm512_cvt_roundepu32_ps (__m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtudq2ps zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvtudq2ps
__m512 _mm512_mask_cvt_roundepu32_ps (__m512 src, __mmask16 k, __m512i a, int rounding)

Synopsis

__m512 _mm512_mask_cvt_roundepu32_ps (__m512 src, __mmask16 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtudq2ps zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vcvtudq2ps
__m512 _mm512_maskz_cvt_roundepu32_ps (__mmask16 k, __m512i a, int rounding)

Synopsis

__m512 _mm512_maskz_cvt_roundepu32_ps (__mmask16 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtudq2ps zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtuqq2pd
__m512d _mm512_cvt_roundepu64_pd (__m512i a, int rounding)

Synopsis

__m512d _mm512_cvt_roundepu64_pd (__m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvtuqq2pd
__m512d _mm512_mask_cvt_roundepu64_pd (__m512d src, __mmask8 k, __m512i a, int rounding)

Synopsis

__m512d _mm512_mask_cvt_roundepu64_pd (__m512d src, __mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvtuqq2pd
__m512d _mm512_maskz_cvt_roundepu64_pd (__mmask8 k, __m512i a, int rounding)

Synopsis

__m512d _mm512_maskz_cvt_roundepu64_pd (__mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtuqq2ps
__m256 _mm512_cvt_roundepu64_ps (__m512i a, int rounding)

Synopsis

__m256 _mm512_cvt_roundepu64_ps (__m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 l := j*32 dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i]) ENDFOR dst[MAX:256] := 0
vcvtuqq2ps
__m256 _mm512_mask_cvt_roundepu64_ps (__m256 src, __mmask8 k, __m512i a, int rounding)

Synopsis

__m256 _mm512_mask_cvt_roundepu64_ps (__m256 src, __mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:256] := 0
vcvtuqq2ps
__m256 _mm512_maskz_cvt_roundepu64_ps (__mmask8 k, __m512i a, int rounding)

Synopsis

__m256 _mm512_maskz_cvt_roundepu64_ps (__mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtsi2ss
__m128 _mm_cvt_roundi32_ss (__m128 a, int b, int rounding)

Synopsis

__m128 _mm_cvt_roundi32_ss (__m128 a, int b, int rounding)
#include "immintrin.h"
Instruction: vcvtsi2ss xmm, xmm, r32 {er}
CPUID Flags: AVX512F

Description

Convert the 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_Int32_To_FP32(b[31:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
vcvtsi2sd
__m128d _mm_cvt_roundi64_sd (__m128d a, __int64 b, int rounding)

Synopsis

__m128d _mm_cvt_roundi64_sd (__m128d a, __int64 b, int rounding)
#include "immintrin.h"
Instruction: vcvtsi2sd xmm, xmm, r64 {er}
CPUID Flags: AVX512F

Description

Convert the 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_Int64_To_FP64(b[63:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0
vcvtsi2ss
__m128 _mm_cvt_roundi64_ss (__m128 a, __int64 b, int rounding)

Synopsis

__m128 _mm_cvt_roundi64_ss (__m128 a, __int64 b, int rounding)
#include "immintrin.h"
Instruction: vcvtsi2ss xmm, xmm, r64 {er}
CPUID Flags: AVX512F

Description

Convert the 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_Int64_To_FP32(b[63:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
vcvtpd2dq
__m256i _mm512_cvt_roundpd_epi32 (__m512d a, int rounding)

Synopsis

__m256i _mm512_cvt_roundpd_epi32 (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2dq ymm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*32 l := j*64 dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ENDFOR dst[MAX:256] := 0
vcvtpd2dq
__m256i _mm512_mask_cvt_roundpd_epi32 (__m256i src, __mmask8 k, __m512d a, int rounding)

Synopsis

__m256i _mm512_mask_cvt_roundpd_epi32 (__m256i src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2dq ymm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvtpd2dq
__m256i _mm512_maskz_cvt_roundpd_epi32 (__mmask8 k, __m512d a, int rounding)

Synopsis

__m256i _mm512_maskz_cvt_roundpd_epi32 (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2dq ymm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtpd2qq
__m512i _mm512_cvt_roundpd_epi64 (__m512d a, int rounding)

Synopsis

__m512i _mm512_cvt_roundpd_epi64 (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvtpd2qq
__m512i _mm512_mask_cvt_roundpd_epi64 (__m512i src, __mmask8 k, __m512d a, int rounding)

Synopsis

__m512i _mm512_mask_cvt_roundpd_epi64 (__m512i src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvtpd2qq
__m512i _mm512_maskz_cvt_roundpd_epi64 (__mmask8 k, __m512d a, int rounding)

Synopsis

__m512i _mm512_maskz_cvt_roundpd_epi64 (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtpd2udq
__m256i _mm512_cvt_roundpd_epu32 (__m512d a, int rounding)

Synopsis

__m256i _mm512_cvt_roundpd_epu32 (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2udq ymm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*32 l := j*64 dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ENDFOR dst[MAX:256] := 0
vcvtpd2udq
__m256i _mm512_mask_cvt_roundpd_epu32 (__m256i src, __mmask8 k, __m512d a, int rounding)

Synopsis

__m256i _mm512_mask_cvt_roundpd_epu32 (__m256i src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2udq ymm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvtpd2udq
__m256i _mm512_maskz_cvt_roundpd_epu32 (__mmask8 k, __m512d a, int rounding)

Synopsis

__m256i _mm512_maskz_cvt_roundpd_epu32 (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2udq ymm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtpd2uqq
__m512i _mm512_cvt_roundpd_epu64 (__m512d a, int rounding)

Synopsis

__m512i _mm512_cvt_roundpd_epu64 (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvtpd2uqq
__m512i _mm512_mask_cvt_roundpd_epu64 (__m512i src, __mmask8 k, __m512d a, int rounding)

Synopsis

__m512i _mm512_mask_cvt_roundpd_epu64 (__m512i src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvtpd2uqq
__m512i _mm512_maskz_cvt_roundpd_epu64 (__mmask8 k, __m512d a, int rounding)

Synopsis

__m512i _mm512_maskz_cvt_roundpd_epu64 (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtpd2ps
__m256 _mm512_cvt_roundpd_ps (__m512d a, int rounding)

Synopsis

__m256 _mm512_cvt_roundpd_ps (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2ps ymm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*32 l := j*64 dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ENDFOR dst[MAX:256] := 0
vcvtpd2ps
__m256 _mm512_mask_cvt_roundpd_ps (__m256 src, __mmask8 k, __m512d a, int rounding)

Synopsis

__m256 _mm512_mask_cvt_roundpd_ps (__m256 src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2ps ymm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvtpd2ps
__m256 _mm512_maskz_cvt_roundpd_ps (__mmask8 k, __m512d a, int rounding)

Synopsis

__m256 _mm512_maskz_cvt_roundpd_ps (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2ps ymm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtpd2ps
__m512 _mm512_cvt_roundpd_pslo (__m512d v2, int rounding)

Synopsis

__m512 _mm512_cvt_roundpd_pslo (__m512d v2, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2ps zmm {k}, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to packed single-precision (32-bit) floating-point elements, storing the results in dst. Results are written to the lower half of dst, and the upper half locations are set to '0'.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 k := j*32 dst[k+31:k] := Float64ToFloat32(v2[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvtpd2ps
__m512 _mm512_mask_cvt_roundpd_pslo (__m512 src, __mmask8 k, __m512d v2, int rounding)

Synopsis

__m512 _mm512_mask_cvt_roundpd_pslo (__m512 src, __mmask8 k, __m512d v2, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2ps zmm {k}, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to packed single-precision (32-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Results are written to the lower half of dst, and the upper half locations are set to '0'.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[l+31:l] := Float64ToFloat32(v2[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:512] := 0
vcvtph2ps
__m512 _mm512_cvt_roundph_ps (__m256i a, int sae)

Synopsis

__m512 _mm512_cvt_roundph_ps (__m256i a, int sae)
#include "immintrin.h"
Instruction: vcvtph2ps zmm {k}, ymm {sae}
CPUID Flags: AVX512F

Description

Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst. Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 15 i := j*32 m := j*16 dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ENDFOR dst[MAX:512] := 0
vcvtph2ps
__m512 _mm512_mask_cvt_roundph_ps (__m512 src, __mmask16 k, __m256i a, int sae)

Synopsis

__m512 _mm512_mask_cvt_roundph_ps (__m512 src, __mmask16 k, __m256i a, int sae)
#include "immintrin.h"
Instruction: vcvtph2ps zmm {k}, ymm {sae}
CPUID Flags: AVX512F

Description

Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 15 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vcvtph2ps
__m512 _mm512_maskz_cvt_roundph_ps (__mmask16 k, __m256i a, int sae)

Synopsis

__m512 _mm512_maskz_cvt_roundph_ps (__mmask16 k, __m256i a, int sae)
#include "immintrin.h"
Instruction: vcvtph2ps zmm {k}, ymm {sae}
CPUID Flags: AVX512F

Description

Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 15 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtps2dq
__m512i _mm512_cvt_roundps_epi32 (__m512 a, int rounding)

Synopsis

__m512i _mm512_cvt_roundps_epi32 (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2dq zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvtps2dq
__m512i _mm512_mask_cvt_roundps_epi32 (__m512i src, __mmask16 k, __m512 a, int rounding)

Synopsis

__m512i _mm512_mask_cvt_roundps_epi32 (__m512i src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2dq zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vcvtps2dq
__m512i _mm512_maskz_cvt_roundps_epi32 (__mmask16 k, __m512 a, int rounding)

Synopsis

__m512i _mm512_maskz_cvt_roundps_epi32 (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2dq zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtps2qq
__m512i _mm512_cvt_roundps_epi64 (__m256 a, int rounding)

Synopsis

__m512i _mm512_cvt_roundps_epi64 (__m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ENDFOR dst[MAX:512] := 0
vcvtps2qq
__m512i _mm512_mask_cvt_roundps_epi64 (__m512i src, __mmask8 k, __m256 a, int rounding)

Synopsis

__m512i _mm512_mask_cvt_roundps_epi64 (__m512i src, __mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvtps2qq
__m512i _mm512_maskz_cvt_roundps_epi64 (__mmask8 k, __m256 a, int rounding)

Synopsis

__m512i _mm512_maskz_cvt_roundps_epi64 (__mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtps2udq
__m512i _mm512_cvt_roundps_epu32 (__m512 a, int rounding)

Synopsis

__m512i _mm512_cvt_roundps_epu32 (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2udq zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvtps2udq
__m512i _mm512_mask_cvt_roundps_epu32 (__m512i src, __mmask16 k, __m512 a, int rounding)

Synopsis

__m512i _mm512_mask_cvt_roundps_epu32 (__m512i src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2udq zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vcvtps2udq
__m512i _mm512_maskz_cvt_roundps_epu32 (__mmask16 k, __m512 a, int rounding)

Synopsis

__m512i _mm512_maskz_cvt_roundps_epu32 (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2udq zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtps2uqq
__m512i _mm512_cvt_roundps_epu64 (__m256 a, int rounding)

Synopsis

__m512i _mm512_cvt_roundps_epu64 (__m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l]) ENDFOR dst[MAX:512] := 0
vcvtps2uqq
__m512i _mm512_mask_cvt_roundps_epu64 (__m512i src, __mmask8 k, __m256 a, int rounding)

Synopsis

__m512i _mm512_mask_cvt_roundps_epu64 (__m512i src, __mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvtps2uqq
__m512i _mm512_maskz_cvt_roundps_epu64 (__mmask8 k, __m256 a, int rounding)

Synopsis

__m512i _mm512_maskz_cvt_roundps_epu64 (__mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtps2pd
__m512d _mm512_cvt_roundps_pd (__m256 a, int sae)

Synopsis

__m512d _mm512_cvt_roundps_pd (__m256 a, int sae)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, ymm {sae}
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst. Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := 64*j k := 32*j dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) ENDFOR dst[MAX:512] := 0
vcvtps2pd
__m512d _mm512_mask_cvt_roundps_pd (__m512d src, __mmask8 k, __m256 a, int sae)

Synopsis

__m512d _mm512_mask_cvt_roundps_pd (__m512d src, __mmask8 k, __m256 a, int sae)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, ymm {sae}
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvtps2pd
__m512d _mm512_maskz_cvt_roundps_pd (__mmask8 k, __m256 a, int sae)

Synopsis

__m512d _mm512_maskz_cvt_roundps_pd (__mmask8 k, __m256 a, int sae)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, ymm {sae}
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtps2ph
__m128i _mm_mask_cvt_roundps_ph (__m128i src, __mmask8 k, __m128 a, int rounding)

Synopsis

__m128i _mm_mask_cvt_roundps_ph (__m128i src, __mmask8 k, __m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 3 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:64] := 0
vcvtps2ph
__m128i _mm_maskz_cvt_roundps_ph (__mmask8 k, __m128 a, int rounding)

Synopsis

__m128i _mm_maskz_cvt_roundps_ph (__mmask8 k, __m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 3 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:64] := 0
vcvtps2ph
__m128i _mm256_mask_cvt_roundps_ph (__m128i src, __mmask8 k, __m256 a, int rounding)

Synopsis

__m128i _mm256_mask_cvt_roundps_ph (__m128i src, __mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vcvtps2ph
__m128i _mm256_maskz_cvt_roundps_ph (__mmask8 k, __m256 a, int rounding)

Synopsis

__m128i _mm256_maskz_cvt_roundps_ph (__mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtps2ph
__m256i _mm512_cvt_roundps_ph (__m512 a, int rounding)

Synopsis

__m256i _mm512_cvt_roundps_ph (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph ymm {k}, zmm {sae}, imm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := 16*j l := 32*j dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ENDFOR dst[MAX:256] := 0
vcvtps2ph
__m256i _mm512_mask_cvt_roundps_ph (__m256i src, __mmask16 k, __m512 a, int rounding)

Synopsis

__m256i _mm512_mask_cvt_roundps_ph (__m256i src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph ymm {k}, zmm {sae}, imm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vcvtps2ph
__m256i _mm512_maskz_cvt_roundps_ph (__mmask16 k, __m512 a, int rounding)

Synopsis

__m256i _mm512_maskz_cvt_roundps_ph (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph ymm {k}, zmm {sae}, imm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtsd2si
int _mm_cvt_roundsd_i32 (__m128d a, int rounding)

Synopsis

int _mm_cvt_roundsd_i32 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2si r32, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_FP64_To_Int32(a[63:0])
vcvtsd2si
__int64 _mm_cvt_roundsd_i64 (__m128d a, int rounding)

Synopsis

__int64 _mm_cvt_roundsd_i64 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2si r64, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_FP64_To_Int64(a[63:0])
vcvtsd2si
int _mm_cvt_roundsd_si32 (__m128d a, int rounding)

Synopsis

int _mm_cvt_roundsd_si32 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2si r32, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_FP64_To_Int32(a[63:0])
vcvtsd2si
__int64 _mm_cvt_roundsd_si64 (__m128d a, int rounding)

Synopsis

__int64 _mm_cvt_roundsd_si64 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2si r64, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_FP64_To_Int64(a[63:0])
vcvtsd2ss
__m128 _mm_cvt_roundsd_ss (__m128 a, __m128d b, int rounding)

Synopsis

__m128 _mm_cvt_roundsd_ss (__m128 a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_FP64_To_FP32(b[63:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
vcvtsd2ss
__m128 _mm_mask_cvt_roundsd_ss (__m128 src, __mmask8 k, __m128 a, __m128d b, int rounding)

Synopsis

__m128 _mm_mask_cvt_roundsd_ss (__m128 src, __mmask8 k, __m128 a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := Convert_FP64_To_FP32(b[63:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vcvtsd2ss
__m128 _mm_maskz_cvt_roundsd_ss (__mmask8 k, __m128 a, __m128d b, int rounding)

Synopsis

__m128 _mm_maskz_cvt_roundsd_ss (__mmask8 k, __m128 a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := Convert_FP64_To_FP32(b[63:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vcvtsd2usi
unsigned int _mm_cvt_roundsd_u32 (__m128d a, int rounding)

Synopsis

unsigned int _mm_cvt_roundsd_u32 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2usi r32, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_FP64_To_UnsignedInt32(a[63:0])
vcvtsd2usi
unsigned __int64 _mm_cvt_roundsd_u64 (__m128d a, int rounding)

Synopsis

unsigned __int64 _mm_cvt_roundsd_u64 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2usi r64, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_FP64_To_UnsignedInt64(a[63:0])
vcvtsi2ss
__m128 _mm_cvt_roundsi32_ss (__m128 a, int b, int rounding)

Synopsis

__m128 _mm_cvt_roundsi32_ss (__m128 a, int b, int rounding)
#include "immintrin.h"
Instruction: vcvtsi2ss xmm, xmm, r32 {er}
CPUID Flags: AVX512F

Description

Convert the 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_Int32_To_FP32(b[31:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
vcvtsi2sd
__m128d _mm_cvt_roundsi64_sd (__m128d a, __int64 b, int rounding)

Synopsis

__m128d _mm_cvt_roundsi64_sd (__m128d a, __int64 b, int rounding)
#include "immintrin.h"
Instruction: vcvtsi2sd xmm, xmm, r64 {er}
CPUID Flags: AVX512F

Description

Convert the 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_Int64_To_FP64(b[63:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0
vcvtsi2ss
__m128 _mm_cvt_roundsi64_ss (__m128 a, __int64 b, int rounding)

Synopsis

__m128 _mm_cvt_roundsi64_ss (__m128 a, __int64 b, int rounding)
#include "immintrin.h"
Instruction: vcvtsi2ss xmm, xmm, r64 {er}
CPUID Flags: AVX512F

Description

Convert the 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_Int64_To_FP32(b[63:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
vcvtss2si
int _mm_cvt_roundss_i32 (__m128 a, int rounding)

Synopsis

int _mm_cvt_roundss_i32 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtss2si r32, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_FP32_To_Int32(a[31:0])
vcvtss2si
__int64 _mm_cvt_roundss_i64 (__m128 a, int rounding)

Synopsis

__int64 _mm_cvt_roundss_i64 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtss2si r64, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_FP32_To_Int64(a[31:0])
vcvtss2sd
__m128d _mm_cvt_roundss_sd (__m128d a, __m128 b, int rounding)

Synopsis

__m128d _mm_cvt_roundss_sd (__m128d a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vcvtss2sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_FP32_To_FP64(b[31:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0
vcvtss2sd
__m128d _mm_mask_cvt_roundss_sd (__m128d src, __mmask8 k, __m128d a, __m128 b, int rounding)

Synopsis

__m128d _mm_mask_cvt_roundss_sd (__m128d src, __mmask8 k, __m128d a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vcvtss2sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := Convert_FP32_To_FP64(b[31:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vcvtss2sd
__m128d _mm_maskz_cvt_roundss_sd (__mmask8 k, __m128d a, __m128 b, int rounding)

Synopsis

__m128d _mm_maskz_cvt_roundss_sd (__mmask8 k, __m128d a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vcvtss2sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := Convert_FP32_To_FP64(b[31:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vcvtss2si
int _mm_cvt_roundss_si32 (__m128 a, int rounding)

Synopsis

int _mm_cvt_roundss_si32 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtss2si r32, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_FP32_To_Int32(a[31:0])
vcvtss2si
__int64 _mm_cvt_roundss_si64 (__m128 a, int rounding)

Synopsis

__int64 _mm_cvt_roundss_si64 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtss2si r64, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_FP32_To_Int64(a[31:0])
vcvtss2usi
unsigned int _mm_cvt_roundss_u32 (__m128 a, int rounding)

Synopsis

unsigned int _mm_cvt_roundss_u32 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtss2usi r32, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_FP32_To_UnsignedInt32(a[31:0])
vcvtss2usi
unsigned __int64 _mm_cvt_roundss_u64 (__m128 a, int rounding)

Synopsis

unsigned __int64 _mm_cvt_roundss_u64 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtss2usi r64, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_FP32_To_UnsignedInt64(a[31:0])
vcvtusi2ss
__m128 _mm_cvt_roundu32_ss (__m128 a, unsigned int b, int rounding)

Synopsis

__m128 _mm_cvt_roundu32_ss (__m128 a, unsigned int b, int rounding)
#include "immintrin.h"
Instruction: vcvtusi2ss xmm, xmm, r32 {er}
CPUID Flags: AVX512F

Description

Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_UnsignedInt32_To_FP32(b[31:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
vcvtusi2sd
__m128d _mm_cvt_roundu64_sd (__m128d a, unsigned __int64 b, int rounding)

Synopsis

__m128d _mm_cvt_roundu64_sd (__m128d a, unsigned __int64 b, int rounding)
#include "immintrin.h"
Instruction: vcvtusi2sd xmm, xmm, r64 {er}
CPUID Flags: AVX512F

Description

Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_UnsignedInt64_To_FP64(b[63:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0
vcvtusi2ss
__m128 _mm_cvt_roundu64_ss (__m128 a, unsigned __int64 b, int rounding)

Synopsis

__m128 _mm_cvt_roundu64_ss (__m128 a, unsigned __int64 b, int rounding)
#include "immintrin.h"
Instruction: vcvtusi2ss xmm, xmm, r64 {er}
CPUID Flags: AVX512F

Description

Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_UnsignedInt64_To_FP32(b[63:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
cvtsi2ss
__m128 _mm_cvt_si2ss (__m128 a, int b)

Synopsis

__m128 _mm_cvt_si2ss (__m128 a, int b)
#include "xmmintrin.h"
Instruction: cvtsi2ss xmm, r32
CPUID Flags: SSE

Description

Convert the 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := Convert_Int32_To_FP32(b[31:0]) dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge51
Sandy Bridge51
Westmere31
Nehalem31
cvtss2si
int _mm_cvt_ss2si (__m128 a)

Synopsis

int _mm_cvt_ss2si (__m128 a)
#include "xmmintrin.h"
Instruction: cvtss2si r32, xmm
CPUID Flags: SSE

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.

Operation

dst[31:0] := Convert_FP32_To_Int32(a[31:0])
pmovsxwd
__m128i _mm_cvtepi16_epi32 (__m128i a)

Synopsis

__m128i _mm_cvtepi16_epi32 (__m128i a)
#include "smmintrin.h"
Instruction: pmovsxwd xmm, xmm
CPUID Flags: SSE4.1

Description

Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 16*j dst[i+31:i] := SignExtend(a[k+15:k]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpmovsxwd
__m128i _mm_mask_cvtepi16_epi32 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepi16_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwd
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 l := j*16 IF k[j] dst[i+31:i] := SignExtend(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpmovsxwd
__m128i _mm_maskz_cvtepi16_epi32 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepi16_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwd
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[i+31:i] := SignExtend(a[l+15:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovsxwd
__m256i _mm256_cvtepi16_epi32 (__m128i a)

Synopsis

__m256i _mm256_cvtepi16_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxwd ymm, xmm
CPUID Flags: AVX2

Description

Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 16*j dst[i+31:i] := SignExtend(a[k+15:k]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpmovsxwd
__m256i _mm256_mask_cvtepi16_epi32 (__m256i src, __mmask8 k, __m128i a)

Synopsis

__m256i _mm256_mask_cvtepi16_epi32 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwd
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 l := j*16 IF k[j] dst[i+31:i] := SignExtend(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpmovsxwd
__m256i _mm256_maskz_cvtepi16_epi32 (__mmask8 k, __m128i a)

Synopsis

__m256i _mm256_maskz_cvtepi16_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwd
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[i+31:i] := SignExtend(a[l+15:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovsxwd
__m512i _mm512_cvtepi16_epi32 (__m256i a)

Synopsis

__m512i _mm512_cvtepi16_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsxwd zmm {k}, ymm
CPUID Flags: AVX512F

Description

Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j k := 16*j dst[i+31:i] := SignExtend(a[k+15:k]) ENDFOR dst[MAX:512] := 0
vpmovsxwd
__m512i _mm512_mask_cvtepi16_epi32 (__m512i src, __mmask16 k, __m256i a)

Synopsis

__m512i _mm512_mask_cvtepi16_epi32 (__m512i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsxwd zmm {k}, ymm
CPUID Flags: AVX512F

Description

Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 l := j*16 IF k[j] dst[i+31:i] := SignExtend(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpmovsxwd
__m512i _mm512_maskz_cvtepi16_epi32 (__mmask16 k, __m256i a)

Synopsis

__m512i _mm512_maskz_cvtepi16_epi32 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsxwd zmm {k}, ymm
CPUID Flags: AVX512F

Description

Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[i+31:i] := SignExtend(a[l+15:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
pmovsxwq
__m128i _mm_cvtepi16_epi64 (__m128i a)

Synopsis

__m128i _mm_cvtepi16_epi64 (__m128i a)
#include "smmintrin.h"
Instruction: pmovsxwq xmm, xmm
CPUID Flags: SSE4.1

Description

Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j k := 16*j dst[i+63:i] := SignExtend(a[k+15:k]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpmovsxwq
__m128i _mm_mask_cvtepi16_epi64 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepi16_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[i+63:i] := SignExtend(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpmovsxwq
__m128i _mm_maskz_cvtepi16_epi64 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepi16_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[i+63:i] := SignExtend(a[l+15:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovsxwq
__m256i _mm256_cvtepi16_epi64 (__m128i a)

Synopsis

__m256i _mm256_cvtepi16_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq ymm, xmm
CPUID Flags: AVX2

Description

Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 16*j dst[i+63:i] := SignExtend(a[k+15:k]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpmovsxwq
__m256i _mm256_mask_cvtepi16_epi64 (__m256i src, __mmask8 k, __m128i a)

Synopsis

__m256i _mm256_mask_cvtepi16_epi64 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[i+63:i] := SignExtend(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpmovsxwq
__m256i _mm256_maskz_cvtepi16_epi64 (__mmask8 k, __m128i a)

Synopsis

__m256i _mm256_maskz_cvtepi16_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[i+63:i] := SignExtend(a[l+15:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovsxwq
__m512i _mm512_cvtepi16_epi64 (__m128i a)

Synopsis

__m512i _mm512_cvtepi16_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq zmm {k}, xmm
CPUID Flags: AVX512F

Description

Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j k := 16*j dst[i+63:i] := SignExtend(a[k+15:k]) ENDFOR dst[MAX:512] := 0
vpmovsxwq
__m512i _mm512_mask_cvtepi16_epi64 (__m512i src, __mmask8 k, __m128i a)

Synopsis

__m512i _mm512_mask_cvtepi16_epi64 (__m512i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq zmm {k}, xmm
CPUID Flags: AVX512F

Description

Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[i+63:i] := SignExtend(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpmovsxwq
__m512i _mm512_maskz_cvtepi16_epi64 (__mmask8 k, __m128i a)

Synopsis

__m512i _mm512_maskz_cvtepi16_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq zmm {k}, xmm
CPUID Flags: AVX512F

Description

Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[i+63:i] := SignExtend(a[l+15:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmovwb
__m128i _mm_cvtepi16_epi8 (__m128i a)

Synopsis

__m128i _mm_cvtepi16_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 16*j l := 8*j dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:64] := 0
vpmovwb
__m128i _mm_mask_cvtepi16_epi8 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepi16_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
vpmovwb
__m128i _mm_maskz_cvtepi16_epi8 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepi16_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovwb
__m128i _mm256_cvtepi16_epi8 (__m256i a)

Synopsis

__m128i _mm256_cvtepi16_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 15 i := 16*j l := 8*j dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:128] := 0
vpmovwb
__m128i _mm256_mask_cvtepi16_epi8 (__m128i src, __mmask16 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtepi16_epi8 (__m128i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:128] := 0
vpmovwb
__m128i _mm256_maskz_cvtepi16_epi8 (__mmask16 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtepi16_epi8 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovwb
__m256i _mm512_cvtepi16_epi8 (__m512i a)

Synopsis

__m256i _mm512_cvtepi16_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 31 i := 16*j l := 8*j dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:256] := 0
vpmovwb
__m256i _mm512_mask_cvtepi16_epi8 (__m256i src, __mmask32 k, __m512i a)

Synopsis

__m256i _mm512_mask_cvtepi16_epi8 (__m256i src, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:256] := 0
vpmovwb
__m256i _mm512_maskz_cvtepi16_epi8 (__mmask32 k, __m512i a)

Synopsis

__m256i _mm512_maskz_cvtepi16_epi8 (__mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovwb
void _mm_mask_cvtepi16_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtepi16_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 16*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Truncate_Int16_To_Int8(a[i+15:i]) FI ENDFOR
vpmovwb
void _mm256_mask_cvtepi16_storeu_epi8 (void* base_addr, __mmask16 k, __m256i a)

Synopsis

void _mm256_mask_cvtepi16_storeu_epi8 (void* base_addr, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 15 i := 16*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Truncate_Int16_To_Int8(a[i+15:i]) FI ENDFOR
vpmovwb
void _mm512_mask_cvtepi16_storeu_epi8 (void* base_addr, __mmask32 k, __m512i a)

Synopsis

void _mm512_mask_cvtepi16_storeu_epi8 (void* base_addr, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 31 i := 16*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Truncate_Int16_To_Int8(a[i+15:i]) FI ENDFOR
vpmovdw
__m128i _mm_cvtepi32_epi16 (__m128i a)

Synopsis

__m128i _mm_cvtepi32_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 16*j dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i]) ENDFOR dst[MAX:64] := 0
vpmovdw
__m128i _mm_mask_cvtepi32_epi16 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepi32_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:64] := 0
vpmovdw
__m128i _mm_maskz_cvtepi32_epi16 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepi32_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovdw
__m128i _mm256_cvtepi32_epi16 (__m256i a)

Synopsis

__m128i _mm256_cvtepi32_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 16*j dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i]) ENDFOR dst[MAX:128] := 0
vpmovdw
__m128i _mm256_mask_cvtepi32_epi16 (__m128i src, __mmask8 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtepi32_epi16 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
vpmovdw
__m128i _mm256_maskz_cvtepi32_epi16 (__mmask8 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtepi32_epi16 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovdw
__m256i _mm512_cvtepi32_epi16 (__m512i a)

Synopsis

__m256i _mm512_cvtepi32_epi16 (__m512i a)
#include "immintrin.h"
Instruction: vpmovdw ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j k := 16*j dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i]) ENDFOR dst[MAX:256] := 0
vpmovdw
__m256i _mm512_mask_cvtepi32_epi16 (__m256i src, __mmask16 k, __m512i a)

Synopsis

__m256i _mm512_mask_cvtepi32_epi16 (__m256i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovdw ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:256] := 0
vpmovdw
__m256i _mm512_maskz_cvtepi32_epi16 (__mmask16 k, __m512i a)

Synopsis

__m256i _mm512_maskz_cvtepi32_epi16 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovdw ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:256] := 0
pmovsxdq
__m128i _mm_cvtepi32_epi64 (__m128i a)

Synopsis

__m128i _mm_cvtepi32_epi64 (__m128i a)
#include "smmintrin.h"
Instruction: pmovsxdq xmm, xmm
CPUID Flags: SSE4.1

Description

Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j k := 32*j dst[i+63:i] := SignExtend(a[k+31:k]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpmovsxdq
__m128i _mm_mask_cvtepi32_epi64 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepi32_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxdq
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[i+63:i] := SignExtend(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpmovsxdq
__m128i _mm_maskz_cvtepi32_epi64 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepi32_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxdq
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[i+63:i] := SignExtend(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovsxdq
__m256i _mm256_cvtepi32_epi64 (__m128i a)

Synopsis

__m256i _mm256_cvtepi32_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxdq ymm, xmm
CPUID Flags: AVX2

Description

Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 32*j dst[i+63:i] := SignExtend(a[k+31:k]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpmovsxdq
__m256i _mm256_mask_cvtepi32_epi64 (__m256i src, __mmask8 k, __m128i a)

Synopsis

__m256i _mm256_mask_cvtepi32_epi64 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxdq
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[i+63:i] := SignExtend(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpmovsxdq
__m256i _mm256_maskz_cvtepi32_epi64 (__mmask8 k, __m128i a)

Synopsis

__m256i _mm256_maskz_cvtepi32_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxdq
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[i+63:i] := SignExtend(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovsxdq
__m512i _mm512_cvtepi32_epi64 (__m256i a)

Synopsis

__m512i _mm512_cvtepi32_epi64 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsxdq zmm {k}, ymm
CPUID Flags: AVX512F

Description

Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j k := 32*j dst[i+63:i] := SignExtend(a[k+31:k]) ENDFOR dst[MAX:512] := 0
vpmovsxdq
__m512i _mm512_mask_cvtepi32_epi64 (__m512i src, __mmask8 k, __m256i a)

Synopsis

__m512i _mm512_mask_cvtepi32_epi64 (__m512i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsxdq zmm {k}, ymm
CPUID Flags: AVX512F

Description

Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := SignExtend(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpmovsxdq
__m512i _mm512_maskz_cvtepi32_epi64 (__mmask8 k, __m256i a)

Synopsis

__m512i _mm512_maskz_cvtepi32_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsxdq zmm {k}, ymm
CPUID Flags: AVX512F

Description

Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := SignExtend(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmovdb
__m128i _mm_cvtepi32_epi8 (__m128i a)

Synopsis

__m128i _mm_cvtepi32_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 8*j dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i]) ENDFOR dst[MAX:32] := 0
vpmovdb
__m128i _mm_mask_cvtepi32_epi8 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepi32_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:32] := 0
vpmovdb
__m128i _mm_maskz_cvtepi32_epi8 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepi32_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:32] := 0
vpmovdb
__m128i _mm256_cvtepi32_epi8 (__m256i a)

Synopsis

__m128i _mm256_cvtepi32_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 8*j dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i]) ENDFOR dst[MAX:64] := 0
vpmovdb
__m128i _mm256_mask_cvtepi32_epi8 (__m128i src, __mmask8 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtepi32_epi8 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
vpmovdb
__m128i _mm256_maskz_cvtepi32_epi8 (__mmask8 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtepi32_epi8 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovdb
__m128i _mm512_cvtepi32_epi8 (__m512i a)

Synopsis

__m128i _mm512_cvtepi32_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovdb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j k := 8*j dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i]) ENDFOR dst[MAX:128] := 0
vpmovdb
__m128i _mm512_mask_cvtepi32_epi8 (__m128i src, __mmask16 k, __m512i a)

Synopsis

__m128i _mm512_mask_cvtepi32_epi8 (__m128i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovdb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:128] := 0
vpmovdb
__m128i _mm512_maskz_cvtepi32_epi8 (__mmask16 k, __m512i a)

Synopsis

__m128i _mm512_maskz_cvtepi32_epi8 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovdb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:128] := 0
cvtdq2pd
__m128d _mm_cvtepi32_pd (__m128i a)

Synopsis

__m128d _mm_cvtepi32_pd (__m128i a)
#include "emmintrin.h"
Instruction: cvtdq2pd xmm, xmm
CPUID Flags: SSE2

Description

Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*32 m := j*64 dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell40.8
Ivy Bridge41
Sandy Bridge41
Westmere41
Nehalem41
vcvtdq2pd
__m128d _mm_mask_cvtepi32_pd (__m128d src, __mmask8 k, __m128i a)

Synopsis

__m128d _mm_mask_cvtepi32_pd (__m128d src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtdq2pd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*32 m := j*64 IF k[j] dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE dst[m+63:m] := src[m+63:m] FI ENDFOR dst[MAX:128] := 0
vcvtdq2pd
__m128d _mm_maskz_cvtepi32_pd (__mmask8 k, __m128i a)

Synopsis

__m128d _mm_maskz_cvtepi32_pd (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtdq2pd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*32 m := j*64 IF k[j] dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE dst[m+63:m] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtdq2pd
__m256d _mm256_cvtepi32_pd (__m128i a)

Synopsis

__m256d _mm256_cvtepi32_pd (__m128i a)
#include "immintrin.h"
Instruction: vcvtdq2pd ymm, xmm
CPUID Flags: AVX

Description

Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 m := j*64 dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell41
Ivy Bridge41
Sandy Bridge41
vcvtdq2pd
__m256d _mm256_mask_cvtepi32_pd (__m256d src, __mmask8 k, __m128i a)

Synopsis

__m256d _mm256_mask_cvtepi32_pd (__m256d src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtdq2pd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 m := j*64 IF k[j] dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE dst[m+63:m] := src[m+63:m] FI ENDFOR dst[MAX:256] := 0
vcvtdq2pd
__m256d _mm256_maskz_cvtepi32_pd (__mmask8 k, __m128i a)

Synopsis

__m256d _mm256_maskz_cvtepi32_pd (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtdq2pd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 m := j*64 IF k[j] dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE dst[m+63:m] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtdq2pd
__m512d _mm512_cvtepi32_pd (__m256i a)

Synopsis

__m512d _mm512_cvtepi32_pd (__m256i a)
#include "immintrin.h"
Instruction: vcvtdq2pd zmm {k}, ymm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 m := j*64 dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvtdq2pd
__m512d _mm512_mask_cvtepi32_pd (__m512d src, __mmask8 k, __m256i a)

Synopsis

__m512d _mm512_mask_cvtepi32_pd (__m512d src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtdq2pd zmm {k}, ymm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 m := j*64 IF k[j] dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE dst[m+63:m] := src[m+63:m] FI ENDFOR dst[MAX:512] := 0
vcvtdq2pd
__m512d _mm512_maskz_cvtepi32_pd (__mmask8 k, __m256i a)

Synopsis

__m512d _mm512_maskz_cvtepi32_pd (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtdq2pd zmm {k}, ymm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 m := j*64 IF k[j] dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE dst[m+63:m] := 0 FI ENDFOR dst[MAX:512] := 0
cvtdq2ps
__m128 _mm_cvtepi32_ps (__m128i a)

Synopsis

__m128 _mm_cvtepi32_ps (__m128i a)
#include "emmintrin.h"
Instruction: cvtdq2ps xmm, xmm
CPUID Flags: SSE2

Description

Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell30.8
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vcvtdq2ps
__m128 _mm_mask_cvtepi32_ps (__m128 src, __mmask8 k, __m128i a)

Synopsis

__m128 _mm_mask_cvtepi32_ps (__m128 src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtdq2ps
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vcvtdq2ps
__m128 _mm_maskz_cvtepi32_ps (__mmask8 k, __m128i a)

Synopsis

__m128 _mm_maskz_cvtepi32_ps (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtdq2ps
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtdq2ps
__m256 _mm256_cvtepi32_ps (__m256i a)

Synopsis

__m256 _mm256_cvtepi32_ps (__m256i a)
#include "immintrin.h"
Instruction: vcvtdq2ps ymm, ymm
CPUID Flags: AVX

Description

Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
vcvtdq2ps
__m256 _mm256_mask_cvtepi32_ps (__m256 src, __mmask8 k, __m256i a)

Synopsis

__m256 _mm256_mask_cvtepi32_ps (__m256 src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtdq2ps
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvtdq2ps
__m256 _mm256_maskz_cvtepi32_ps (__mmask8 k, __m256i a)

Synopsis

__m256 _mm256_maskz_cvtepi32_ps (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtdq2ps
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtdq2ps
__m512 _mm512_cvtepi32_ps (__m512i a)

Synopsis

__m512 _mm512_cvtepi32_ps (__m512i a)
#include "immintrin.h"
Instruction: vcvtdq2ps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvtdq2ps
__m512 _mm512_mask_cvtepi32_ps (__m512 src, __mmask16 k, __m512i a)

Synopsis

__m512 _mm512_mask_cvtepi32_ps (__m512 src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtdq2ps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vcvtdq2ps
__m512 _mm512_maskz_cvtepi32_ps (__mmask16 k, __m512i a)

Synopsis

__m512 _mm512_maskz_cvtepi32_ps (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtdq2ps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmovdw
void _mm_mask_cvtepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Truncate_Int32_To_Int16(a[i+31:i]) FI ENDFOR
vpmovdw
void _mm256_mask_cvtepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_cvtepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Truncate_Int32_To_Int16(a[i+31:i]) FI ENDFOR
vpmovdw
void _mm512_mask_cvtepi32_storeu_epi16 (void* base_addr, __mmask16 k, __m512i a)

Synopsis

void _mm512_mask_cvtepi32_storeu_epi16 (void* base_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovdw m256 {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Truncate_Int32_To_Int16(a[i+31:i]) FI ENDFOR
vpmovdb
void _mm_mask_cvtepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Truncate_Int32_To_Int8(a[i+31:i]) FI ENDFOR
vpmovdb
void _mm256_mask_cvtepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_cvtepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Truncate_Int32_To_Int8(a[i+31:i]) FI ENDFOR
vpmovdb
void _mm512_mask_cvtepi32_storeu_epi8 (void* base_addr, __mmask16 k, __m512i a)

Synopsis

void _mm512_mask_cvtepi32_storeu_epi8 (void* base_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovdb m128 {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Truncate_Int32_To_Int8(a[i+31:i]) FI ENDFOR
vcvtdq2pd
__m512d _mm512_cvtepi32lo_pd (__m512i v2)

Synopsis

__m512d _mm512_cvtepi32lo_pd (__m512i v2)
#include "immintrin.h"
Instruction: vcvtdq2pd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst.

Operation

FOR j := 0 to 7 i := j*32 l := j*64 dst[l+63:l] := Int32ToFloat64(v2[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvtdq2pd
__m512d _mm512_mask_cvtepi32lo_pd (__m512d src, __mmask8 k, __m512i v2)

Synopsis

__m512d _mm512_mask_cvtepi32lo_pd (__m512d src, __mmask8 k, __m512i v2)
#include "immintrin.h"
Instruction: vcvtdq2pd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 n := j*64 IF k[j] dst[n+63:n] := Int32ToFloat64(v2[i+31:i]) ELSE dst[n+63:n] := src[n+63:n] FI ENDFOR dst[MAX:512] := 0
vpmovqw
__m128i _mm_cvtepi64_epi16 (__m128i a)

Synopsis

__m128i _mm_cvtepi64_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j k := 16*j dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i]) ENDFOR dst[MAX:32] := 0
vpmovqw
__m128i _mm_mask_cvtepi64_epi16 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepi64_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:32] := 0
vpmovqw
__m128i _mm_maskz_cvtepi64_epi16 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepi64_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:32] := 0
vpmovqw
__m128i _mm256_cvtepi64_epi16 (__m256i a)

Synopsis

__m128i _mm256_cvtepi64_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 16*j dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i]) ENDFOR dst[MAX:64] := 0
vpmovqw
__m128i _mm256_mask_cvtepi64_epi16 (__m128i src, __mmask8 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtepi64_epi16 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:64] := 0
vpmovqw
__m128i _mm256_maskz_cvtepi64_epi16 (__mmask8 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtepi64_epi16 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovqw
__m128i _mm512_cvtepi64_epi16 (__m512i a)

Synopsis

__m128i _mm512_cvtepi64_epi16 (__m512i a)
#include "immintrin.h"
Instruction: vpmovqw xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j k := 16*j dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vpmovqw
__m128i _mm512_mask_cvtepi64_epi16 (__m128i src, __mmask8 k, __m512i a)

Synopsis

__m128i _mm512_mask_cvtepi64_epi16 (__m128i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqw xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
vpmovqw
__m128i _mm512_maskz_cvtepi64_epi16 (__mmask8 k, __m512i a)

Synopsis

__m128i _mm512_maskz_cvtepi64_epi16 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqw xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovqd
__m128i _mm_cvtepi64_epi32 (__m128i a)

Synopsis

__m128i _mm_cvtepi64_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j k := 32*j dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i]) ENDFOR dst[MAX:64] := 0
vpmovqd
__m128i _mm_mask_cvtepi64_epi32 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepi64_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:64] := 0
vpmovqd
__m128i _mm_maskz_cvtepi64_epi32 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepi64_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovqd
__m128i _mm256_cvtepi64_epi32 (__m256i a)

Synopsis

__m128i _mm256_cvtepi64_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 32*j dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vpmovqd
__m128i _mm256_mask_cvtepi64_epi32 (__m128i src, __mmask8 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtepi64_epi32 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:128] := 0
vpmovqd
__m128i _mm256_maskz_cvtepi64_epi32 (__mmask8 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtepi64_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovqd
__m256i _mm512_cvtepi64_epi32 (__m512i a)

Synopsis

__m256i _mm512_cvtepi64_epi32 (__m512i a)
#include "immintrin.h"
Instruction: vpmovqd ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j k := 32*j dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i]) ENDFOR dst[MAX:256] := 0
vpmovqd
__m256i _mm512_mask_cvtepi64_epi32 (__m256i src, __mmask8 k, __m512i a)

Synopsis

__m256i _mm512_mask_cvtepi64_epi32 (__m256i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqd ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:256] := 0
vpmovqd
__m256i _mm512_maskz_cvtepi64_epi32 (__mmask8 k, __m512i a)

Synopsis

__m256i _mm512_maskz_cvtepi64_epi32 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqd ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovqb
__m128i _mm_cvtepi64_epi8 (__m128i a)

Synopsis

__m128i _mm_cvtepi64_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j k := 8*j dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i]) ENDFOR dst[MAX:16] := 0
vpmovqb
__m128i _mm_mask_cvtepi64_epi8 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepi64_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:16] := 0
vpmovqb
__m128i _mm_maskz_cvtepi64_epi8 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepi64_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:16] := 0
vpmovqb
__m128i _mm256_cvtepi64_epi8 (__m256i a)

Synopsis

__m128i _mm256_cvtepi64_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 8*j dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i]) ENDFOR dst[MAX:32] := 0
vpmovqb
__m128i _mm256_mask_cvtepi64_epi8 (__m128i src, __mmask8 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtepi64_epi8 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:32] := 0
vpmovqb
__m128i _mm256_maskz_cvtepi64_epi8 (__mmask8 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtepi64_epi8 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:32] := 0
vpmovqb
__m128i _mm512_cvtepi64_epi8 (__m512i a)

Synopsis

__m128i _mm512_cvtepi64_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovqb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j k := 8*j dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i]) ENDFOR dst[MAX:64] := 0
vpmovqb
__m128i _mm512_mask_cvtepi64_epi8 (__m128i src, __mmask8 k, __m512i a)

Synopsis

__m128i _mm512_mask_cvtepi64_epi8 (__m128i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
vpmovqb
__m128i _mm512_maskz_cvtepi64_epi8 (__mmask8 k, __m512i a)

Synopsis

__m128i _mm512_maskz_cvtepi64_epi8 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
vcvtqq2pd
__m128d _mm_cvtepi64_pd (__m128i a)

Synopsis

__m128d _mm_cvtepi64_pd (__m128i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vcvtqq2pd
__m128d _mm_mask_cvtepi64_pd (__m128d src, __mmask8 k, __m128i a)

Synopsis

__m128d _mm_mask_cvtepi64_pd (__m128d src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vcvtqq2pd
__m128d _mm_maskz_cvtepi64_pd (__mmask8 k, __m128i a)

Synopsis

__m128d _mm_maskz_cvtepi64_pd (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtqq2pd
__m256d _mm256_cvtepi64_pd (__m256i a)

Synopsis

__m256d _mm256_cvtepi64_pd (__m256i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ENDFOR dst[MAX:256] := 0
vcvtqq2pd
__m256d _mm256_mask_cvtepi64_pd (__m256d src, __mmask8 k, __m256i a)

Synopsis

__m256d _mm256_mask_cvtepi64_pd (__m256d src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vcvtqq2pd
__m256d _mm256_maskz_cvtepi64_pd (__mmask8 k, __m256i a)

Synopsis

__m256d _mm256_maskz_cvtepi64_pd (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtqq2pd
__m512d _mm512_cvtepi64_pd (__m512i a)

Synopsis

__m512d _mm512_cvtepi64_pd (__m512i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512DQ

Description

Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvtqq2pd
__m512d _mm512_mask_cvtepi64_pd (__m512d src, __mmask8 k, __m512i a)

Synopsis

__m512d _mm512_mask_cvtepi64_pd (__m512d src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512DQ

Description

Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvtqq2pd
__m512d _mm512_maskz_cvtepi64_pd (__mmask8 k, __m512i a)

Synopsis

__m512d _mm512_maskz_cvtepi64_pd (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512DQ

Description

Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtqq2ps
__m128 _mm_cvtepi64_ps (__m128i a)

Synopsis

__m128 _mm_cvtepi64_ps (__m128i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 l := j*32 dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ENDFOR dst[MAX:64] := 0
vcvtqq2ps
__m128 _mm_mask_cvtepi64_ps (__m128 src, __mmask8 k, __m128i a)

Synopsis

__m128 _mm_mask_cvtepi64_ps (__m128 src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:64] := 0
vcvtqq2ps
__m128 _mm_maskz_cvtepi64_ps (__mmask8 k, __m128i a)

Synopsis

__m128 _mm_maskz_cvtepi64_ps (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:64] := 0
vcvtqq2ps
__m128 _mm256_cvtepi64_ps (__m256i a)

Synopsis

__m128 _mm256_cvtepi64_ps (__m256i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 l := j*32 dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vcvtqq2ps
__m128 _mm256_mask_cvtepi64_ps (__m128 src, __mmask8 k, __m256i a)

Synopsis

__m128 _mm256_mask_cvtepi64_ps (__m128 src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:128] := 0
vcvtqq2ps
__m128 _mm256_maskz_cvtepi64_ps (__mmask8 k, __m256i a)

Synopsis

__m128 _mm256_maskz_cvtepi64_ps (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtqq2ps
__m256 _mm512_cvtepi64_ps (__m512i a)

Synopsis

__m256 _mm512_cvtepi64_ps (__m512i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512DQ

Description

Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ENDFOR dst[MAX:256] := 0
vcvtqq2ps
__m256 _mm512_mask_cvtepi64_ps (__m256 src, __mmask8 k, __m512i a)

Synopsis

__m256 _mm512_mask_cvtepi64_ps (__m256 src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512DQ

Description

Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:256] := 0
vcvtqq2ps
__m256 _mm512_maskz_cvtepi64_ps (__mmask8 k, __m512i a)

Synopsis

__m256 _mm512_maskz_cvtepi64_ps (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512DQ

Description

Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovqw
void _mm_mask_cvtepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Truncate_Int64_To_Int16(a[i+63:i]) FI ENDFOR
vpmovqw
void _mm256_mask_cvtepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_cvtepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Truncate_Int64_To_Int16(a[i+63:i]) FI ENDFOR
vpmovqw
void _mm512_mask_cvtepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m512i a)

Synopsis

void _mm512_mask_cvtepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqw m128 {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Truncate_Int64_To_Int16(a[i+63:i]) FI ENDFOR
vpmovqd
void _mm_mask_cvtepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] MEM[base_addr+l+31:base_addr+l] := Truncate_Int64_To_Int32(a[i+63:i]) FI ENDFOR
vpmovqd
void _mm256_mask_cvtepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_cvtepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] MEM[base_addr+l+31:base_addr+l] := Truncate_Int64_To_Int32(a[i+63:i]) FI ENDFOR
vpmovqd
void _mm512_mask_cvtepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m512i a)

Synopsis

void _mm512_mask_cvtepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqd m256 {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] MEM[base_addr+l+31:base_addr+l] := Truncate_Int64_To_Int32(a[i+63:i]) FI ENDFOR
vpmovqb
void _mm_mask_cvtepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Truncate_Int64_To_Int8(a[i+63:i]) FI ENDFOR
vpmovqb
void _mm256_mask_cvtepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_cvtepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Truncate_Int64_To_Int8(a[i+63:i]) FI ENDFOR
vpmovqb
void _mm512_mask_cvtepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m512i a)

Synopsis

void _mm512_mask_cvtepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqb m64 {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Truncate_Int64_To_Int8(a[i+63:i]) FI ENDFOR
pmovsxbw
__m128i _mm_cvtepi8_epi16 (__m128i a)

Synopsis

__m128i _mm_cvtepi8_epi16 (__m128i a)
#include "smmintrin.h"
Instruction: pmovsxbw xmm, xmm
CPUID Flags: SSE4.1

Description

Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*8 l := j*16 dst[l+15:l] := SignExtend(a[i+7:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpmovsxbw
__m128i _mm_mask_cvtepi8_epi16 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepi8_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbw
CPUID Flags: AVX512VL + AVX512BW

Description

Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*8 l := j*16 IF k[j] dst[l+15:l] := SignExtend(a[i+7:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
vpmovsxbw
__m128i _mm_maskz_cvtepi8_epi16 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepi8_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbw
CPUID Flags: AVX512VL + AVX512BW

Description

Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*8 l := j*16 IF k[j] dst[l+15:l] := SignExtend(a[i+7:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovsxbw
__m256i _mm256_cvtepi8_epi16 (__m128i a)

Synopsis

__m256i _mm256_cvtepi8_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxbw ymm, xmm
CPUID Flags: AVX2

Description

Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 l := j*16 dst[l+15:l] := SignExtend(a[i+7:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpmovsxbw
__m256i _mm256_mask_cvtepi8_epi16 (__m256i src, __mmask16 k, __m128i a)

Synopsis

__m256i _mm256_mask_cvtepi8_epi16 (__m256i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbw
CPUID Flags: AVX512VL + AVX512BW

Description

Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 l := j*16 IF k[j] dst[l+15:l] := SignExtend(a[i+7:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:256] := 0
vpmovsxbw
__m256i _mm256_maskz_cvtepi8_epi16 (__mmask16 k, __m128i a)

Synopsis

__m256i _mm256_maskz_cvtepi8_epi16 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbw
CPUID Flags: AVX512VL + AVX512BW

Description

Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 l := j*16 IF k[j] dst[l+15:l] := SignExtend(a[i+7:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovsxbw
__m512i _mm512_cvtepi8_epi16 (__m256i a)

Synopsis

__m512i _mm512_cvtepi8_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsxbw
CPUID Flags: AVX512BW

Description

Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 l := j*16 dst[l+15:l] := SignExtend(a[i+7:i]) ENDFOR dst[MAX:512] := 0
vpmovsxbw
__m512i _mm512_mask_cvtepi8_epi16 (__m512i src, __mmask32 k, __m256i a)

Synopsis

__m512i _mm512_mask_cvtepi8_epi16 (__m512i src, __mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsxbw
CPUID Flags: AVX512BW

Description

Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 l := j*16 IF k[j] dst[l+15:l] := SignExtend(a[i+7:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:512] := 0
vpmovsxbw
__m512i _mm512_maskz_cvtepi8_epi16 (__mmask32 k, __m256i a)

Synopsis

__m512i _mm512_maskz_cvtepi8_epi16 (__mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsxbw
CPUID Flags: AVX512BW

Description

Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 l := j*16 IF k[j] dst[l+15:l] := SignExtend(a[i+7:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:512] := 0
pmovsxbd
__m128i _mm_cvtepi8_epi32 (__m128i a)

Synopsis

__m128i _mm_cvtepi8_epi32 (__m128i a)
#include "smmintrin.h"
Instruction: pmovsxbd xmm, xmm
CPUID Flags: SSE4.1

Description

Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 8*j dst[i+31:i] := SignExtend(a[k+7:k]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpmovsxbd
__m128i _mm_mask_cvtepi8_epi32 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepi8_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[i+31:i] := SignExtend(a[l+7:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpmovsxbd
__m128i _mm_maskz_cvtepi8_epi32 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepi8_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[i+31:i] := SignExtend(a[l+7:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovsxbd
__m256i _mm256_cvtepi8_epi32 (__m128i a)

Synopsis

__m256i _mm256_cvtepi8_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd ymm, xmm
CPUID Flags: AVX2

Description

Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 8*j dst[i+31:i] := SignExtend(a[k+7:k]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpmovsxbd
__m256i _mm256_mask_cvtepi8_epi32 (__m256i src, __mmask8 k, __m128i a)

Synopsis

__m256i _mm256_mask_cvtepi8_epi32 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[i+31:i] := SignExtend(a[l+7:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpmovsxbd
__m256i _mm256_maskz_cvtepi8_epi32 (__mmask8 k, __m128i a)

Synopsis

__m256i _mm256_maskz_cvtepi8_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[i+31:i] := SignExtend(a[l+7:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovsxbd
__m512i _mm512_cvtepi8_epi32 (__m128i a)

Synopsis

__m512i _mm512_cvtepi8_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd zmm {k}, xmm
CPUID Flags: AVX512F

Description

Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j k := 8*j dst[i+31:i] := SignExtend(a[k+7:k]) ENDFOR dst[MAX:512] := 0
vpmovsxbd
__m512i _mm512_mask_cvtepi8_epi32 (__m512i src, __mmask16 k, __m128i a)

Synopsis

__m512i _mm512_mask_cvtepi8_epi32 (__m512i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd zmm {k}, xmm
CPUID Flags: AVX512F

Description

Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[i+31:i] := SignExtend(a[l+7:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpmovsxbd
__m512i _mm512_maskz_cvtepi8_epi32 (__mmask16 k, __m128i a)

Synopsis

__m512i _mm512_maskz_cvtepi8_epi32 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd zmm {k}, xmm
CPUID Flags: AVX512F

Description

Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[i+31:i] := SignExtend(a[l+7:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
pmovsxbq
__m128i _mm_cvtepi8_epi64 (__m128i a)

Synopsis

__m128i _mm_cvtepi8_epi64 (__m128i a)
#include "smmintrin.h"
Instruction: pmovsxbq xmm, xmm
CPUID Flags: SSE4.1

Description

Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j k := 8*j dst[i+63:i] := SignExtend(a[k+7:k]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpmovsxbq
__m128i _mm_mask_cvtepi8_epi64 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepi8_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[i+63:i] := SignExtend(a[l+7:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpmovsxbq
__m128i _mm_maskz_cvtepi8_epi64 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepi8_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[i+63:i] := SignExtend(a[l+7:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovsxbq
__m256i _mm256_cvtepi8_epi64 (__m128i a)

Synopsis

__m256i _mm256_cvtepi8_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq ymm, xmm
CPUID Flags: AVX2

Description

Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 8*j dst[i+63:i] := SignExtend(a[k+7:k]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpmovsxbq
__m256i _mm256_mask_cvtepi8_epi64 (__m256i src, __mmask8 k, __m128i a)

Synopsis

__m256i _mm256_mask_cvtepi8_epi64 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[i+63:i] := SignExtend(a[l+7:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpmovsxbq
__m256i _mm256_maskz_cvtepi8_epi64 (__mmask8 k, __m128i a)

Synopsis

__m256i _mm256_maskz_cvtepi8_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq
CPUID Flags: AVX512VL + AVX512F

Description

Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[i+63:i] := SignExtend(a[l+7:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovsxbq
__m512i _mm512_cvtepi8_epi64 (__m128i a)

Synopsis

__m512i _mm512_cvtepi8_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq zmm {k}, xmm
CPUID Flags: AVX512F

Description

Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j k := 8*j dst[i+63:i] := SignExtend(a[k+7:k]) ENDFOR dst[MAX:512] := 0
vpmovsxbq
__m512i _mm512_mask_cvtepi8_epi64 (__m512i src, __mmask8 k, __m128i a)

Synopsis

__m512i _mm512_mask_cvtepi8_epi64 (__m512i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq zmm {k}, xmm
CPUID Flags: AVX512F

Description

Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[i+63:i] := SignExtend(a[l+7:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpmovsxbq
__m512i _mm512_maskz_cvtepi8_epi64 (__mmask8 k, __m128i a)

Synopsis

__m512i _mm512_maskz_cvtepi8_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq zmm {k}, xmm
CPUID Flags: AVX512F

Description

Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[i+63:i] := SignExtend(a[l+7:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
pmovzxwd
__m128i _mm_cvtepu16_epi32 (__m128i a)

Synopsis

__m128i _mm_cvtepu16_epi32 (__m128i a)
#include "smmintrin.h"
Instruction: pmovzxwd xmm, xmm
CPUID Flags: SSE4.1

Description

Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 16*j dst[i+31:i] := ZeroExtend(a[k+15:k]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpmovzxwd
__m128i _mm_mask_cvtepu16_epi32 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepu16_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwd
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpmovzxwd
__m128i _mm_maskz_cvtepu16_epi32 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepu16_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwd
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovzxwd
__m256i _mm256_cvtepu16_epi32 (__m128i a)

Synopsis

__m256i _mm256_cvtepu16_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxwd ymm, xmm
CPUID Flags: AVX2

Description

Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 16*j dst[i+31:i] := ZeroExtend(a[k+15:k]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpmovzxwd
__m256i _mm256_mask_cvtepu16_epi32 (__m256i src, __mmask8 k, __m128i a)

Synopsis

__m256i _mm256_mask_cvtepu16_epi32 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwd
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpmovzxwd
__m256i _mm256_maskz_cvtepu16_epi32 (__mmask8 k, __m128i a)

Synopsis

__m256i _mm256_maskz_cvtepu16_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwd
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovzxwd
__m512i _mm512_cvtepu16_epi32 (__m256i a)

Synopsis

__m512i _mm512_cvtepu16_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vpmovzxwd zmm {k}, ymm
CPUID Flags: AVX512F

Description

Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j k := 16*j dst[i+31:i] := ZeroExtend(a[k+15:k]) ENDFOR dst[MAX:512] := 0
vpmovzxwd
__m512i _mm512_mask_cvtepu16_epi32 (__m512i src, __mmask16 k, __m256i a)

Synopsis

__m512i _mm512_mask_cvtepu16_epi32 (__m512i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovzxwd zmm {k}, ymm
CPUID Flags: AVX512F

Description

Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpmovzxwd
__m512i _mm512_maskz_cvtepu16_epi32 (__mmask16 k, __m256i a)

Synopsis

__m512i _mm512_maskz_cvtepu16_epi32 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovzxwd zmm {k}, ymm
CPUID Flags: AVX512F

Description

Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
pmovzxwq
__m128i _mm_cvtepu16_epi64 (__m128i a)

Synopsis

__m128i _mm_cvtepu16_epi64 (__m128i a)
#include "smmintrin.h"
Instruction: pmovzxwq xmm, xmm
CPUID Flags: SSE4.1

Description

Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j k := 16*j dst[i+63:i] := ZeroExtend(a[k+15:k]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpmovzxwq
__m128i _mm_mask_cvtepu16_epi64 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepu16_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpmovzxwq
__m128i _mm_maskz_cvtepu16_epi64 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepu16_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovzxwq
__m256i _mm256_cvtepu16_epi64 (__m128i a)

Synopsis

__m256i _mm256_cvtepu16_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq ymm, xmm
CPUID Flags: AVX2

Description

Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 16*j dst[i+63:i] := ZeroExtend(a[k+15:k]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpmovzxwq
__m256i _mm256_mask_cvtepu16_epi64 (__m256i src, __mmask8 k, __m128i a)

Synopsis

__m256i _mm256_mask_cvtepu16_epi64 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpmovzxwq
__m256i _mm256_maskz_cvtepu16_epi64 (__mmask8 k, __m128i a)

Synopsis

__m256i _mm256_maskz_cvtepu16_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovzxwq
__m512i _mm512_cvtepu16_epi64 (__m128i a)

Synopsis

__m512i _mm512_cvtepu16_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq zmm {k}, xmm
CPUID Flags: AVX512F

Description

Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j k := 16*j dst[i+63:i] := ZeroExtend(a[k+15:k]) ENDFOR dst[MAX:512] := 0
vpmovzxwq
__m512i _mm512_mask_cvtepu16_epi64 (__m512i src, __mmask8 k, __m128i a)

Synopsis

__m512i _mm512_mask_cvtepu16_epi64 (__m512i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq zmm {k}, xmm
CPUID Flags: AVX512F

Description

Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpmovzxwq
__m512i _mm512_maskz_cvtepu16_epi64 (__mmask8 k, __m128i a)

Synopsis

__m512i _mm512_maskz_cvtepu16_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq zmm {k}, xmm
CPUID Flags: AVX512F

Description

Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
pmovzxdq
__m128i _mm_cvtepu32_epi64 (__m128i a)

Synopsis

__m128i _mm_cvtepu32_epi64 (__m128i a)
#include "smmintrin.h"
Instruction: pmovzxdq xmm, xmm
CPUID Flags: SSE4.1

Description

Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j k := 32*j dst[i+63:i] := ZeroExtend(a[k+31:k]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpmovzxdq
__m128i _mm_mask_cvtepu32_epi64 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepu32_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxdq
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpmovzxdq
__m128i _mm_maskz_cvtepu32_epi64 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepu32_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxdq
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovzxdq
__m256i _mm256_cvtepu32_epi64 (__m128i a)

Synopsis

__m256i _mm256_cvtepu32_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxdq ymm, xmm
CPUID Flags: AVX2

Description

Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 32*j dst[i+63:i] := ZeroExtend(a[k+31:k]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpmovzxdq
__m256i _mm256_mask_cvtepu32_epi64 (__m256i src, __mmask8 k, __m128i a)

Synopsis

__m256i _mm256_mask_cvtepu32_epi64 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxdq
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpmovzxdq
__m256i _mm256_maskz_cvtepu32_epi64 (__mmask8 k, __m128i a)

Synopsis

__m256i _mm256_maskz_cvtepu32_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxdq
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovzxdq
__m512i _mm512_cvtepu32_epi64 (__m256i a)

Synopsis

__m512i _mm512_cvtepu32_epi64 (__m256i a)
#include "immintrin.h"
Instruction: vpmovzxdq zmm {k}, ymm
CPUID Flags: AVX512F

Description

Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j k := 32*j dst[i+63:i] := ZeroExtend(a[k+31:k]) ENDFOR dst[MAX:512] := 0
vpmovzxdq
__m512i _mm512_mask_cvtepu32_epi64 (__m512i src, __mmask8 k, __m256i a)

Synopsis

__m512i _mm512_mask_cvtepu32_epi64 (__m512i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovzxdq zmm {k}, ymm
CPUID Flags: AVX512F

Description

Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpmovzxdq
__m512i _mm512_maskz_cvtepu32_epi64 (__mmask8 k, __m256i a)

Synopsis

__m512i _mm512_maskz_cvtepu32_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovzxdq zmm {k}, ymm
CPUID Flags: AVX512F

Description

Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtudq2pd
__m128d _mm_cvtepu32_pd (__m128i a)

Synopsis

__m128d _mm_cvtepu32_pd (__m128i a)
#include "immintrin.h"
Instruction: vcvtudq2pd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 l := j*32 dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l]) ENDFOR dst[MAX:128] := 0
vcvtudq2pd
__m128d _mm_mask_cvtepu32_pd (__m128d src, __mmask8 k, __m128i a)

Synopsis

__m128d _mm_mask_cvtepu32_pd (__m128d src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtudq2pd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vcvtudq2pd
__m128d _mm_maskz_cvtepu32_pd (__mmask8 k, __m128i a)

Synopsis

__m128d _mm_maskz_cvtepu32_pd (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtudq2pd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtudq2pd
__m256d _mm256_cvtepu32_pd (__m128i a)

Synopsis

__m256d _mm256_cvtepu32_pd (__m128i a)
#include "immintrin.h"
Instruction: vcvtudq2pd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 l := j*32 dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l]) ENDFOR dst[MAX:256] := 0
vcvtudq2pd
__m256d _mm256_mask_cvtepu32_pd (__m256d src, __mmask8 k, __m128i a)

Synopsis

__m256d _mm256_mask_cvtepu32_pd (__m256d src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtudq2pd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vcvtudq2pd
__m256d _mm256_maskz_cvtepu32_pd (__mmask8 k, __m128i a)

Synopsis

__m256d _mm256_maskz_cvtepu32_pd (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtudq2pd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtudq2pd
__m512d _mm512_cvtepu32_pd (__m256i a)

Synopsis

__m512d _mm512_cvtepu32_pd (__m256i a)
#include "immintrin.h"
Instruction: vcvtudq2pd zmm {k}, ymm
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l]) ENDFOR dst[MAX:512] := 0
vcvtudq2pd
__m512d _mm512_mask_cvtepu32_pd (__m512d src, __mmask8 k, __m256i a)

Synopsis

__m512d _mm512_mask_cvtepu32_pd (__m512d src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtudq2pd zmm {k}, ymm
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvtudq2pd
__m512d _mm512_maskz_cvtepu32_pd (__mmask8 k, __m256i a)

Synopsis

__m512d _mm512_maskz_cvtepu32_pd (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtudq2pd zmm {k}, ymm
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtudq2ps
__m512 _mm512_cvtepu32_ps (__m512i a)

Synopsis

__m512 _mm512_cvtepu32_ps (__m512i a)
#include "immintrin.h"
Instruction: vcvtudq2ps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvtudq2ps
__m512 _mm512_mask_cvtepu32_ps (__m512 src, __mmask16 k, __m512i a)

Synopsis

__m512 _mm512_mask_cvtepu32_ps (__m512 src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtudq2ps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vcvtudq2ps
__m512 _mm512_maskz_cvtepu32_ps (__mmask16 k, __m512i a)

Synopsis

__m512 _mm512_maskz_cvtepu32_ps (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtudq2ps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtudq2pd
__m512d _mm512_cvtepu32lo_pd (__m512i v2)

Synopsis

__m512d _mm512_cvtepu32lo_pd (__m512i v2)
#include "immintrin.h"
Instruction: vcvtudq2pd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst.

Operation

FOR j := 0 to 7 i := j*32 k := j*64 dst[k+63:k] := UInt32ToFloat64(v2[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvtudq2pd
__m512d _mm512_mask_cvtepu32lo_pd (__m512d src, __mmask8 k, __m512i v2)

Synopsis

__m512d _mm512_mask_cvtepu32lo_pd (__m512d src, __mmask8 k, __m512i v2)
#include "immintrin.h"
Instruction: vcvtudq2pd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Performs element-by-element conversion of the lower half of 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[l+63:l] := UInt32ToFloat64(v2[i+31:i]) ELSE dst[l+63:l] := src[l+63:l] FI ENDFOR dst[MAX:512] := 0
vcvtuqq2pd
__m128d _mm_cvtepu64_pd (__m128i a)

Synopsis

__m128d _mm_cvtepu64_pd (__m128i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vcvtuqq2pd
__m128d _mm_mask_cvtepu64_pd (__m128d src, __mmask8 k, __m128i a)

Synopsis

__m128d _mm_mask_cvtepu64_pd (__m128d src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vcvtuqq2pd
__m128d _mm_maskz_cvtepu64_pd (__mmask8 k, __m128i a)

Synopsis

__m128d _mm_maskz_cvtepu64_pd (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtuqq2pd
__m256d _mm256_cvtepu64_pd (__m256i a)

Synopsis

__m256d _mm256_cvtepu64_pd (__m256i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i]) ENDFOR dst[MAX:256] := 0
vcvtuqq2pd
__m256d _mm256_mask_cvtepu64_pd (__m256d src, __mmask8 k, __m256i a)

Synopsis

__m256d _mm256_mask_cvtepu64_pd (__m256d src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vcvtuqq2pd
__m256d _mm256_maskz_cvtepu64_pd (__mmask8 k, __m256i a)

Synopsis

__m256d _mm256_maskz_cvtepu64_pd (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtuqq2pd
__m512d _mm512_cvtepu64_pd (__m512i a)

Synopsis

__m512d _mm512_cvtepu64_pd (__m512i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvtuqq2pd
__m512d _mm512_mask_cvtepu64_pd (__m512d src, __mmask8 k, __m512i a)

Synopsis

__m512d _mm512_mask_cvtepu64_pd (__m512d src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvtuqq2pd
__m512d _mm512_maskz_cvtepu64_pd (__mmask8 k, __m512i a)

Synopsis

__m512d _mm512_maskz_cvtepu64_pd (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtuqq2ps
__m128 _mm_cvtepu64_ps (__m128i a)

Synopsis

__m128 _mm_cvtepu64_ps (__m128i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 l := j*32 dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i]) ENDFOR dst[MAX:64] := 0
vcvtuqq2ps
__m128 _mm_mask_cvtepu64_ps (__m128 src, __mmask8 k, __m128i a)

Synopsis

__m128 _mm_mask_cvtepu64_ps (__m128 src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:64] := 0
vcvtuqq2ps
__m128 _mm_maskz_cvtepu64_ps (__mmask8 k, __m128i a)

Synopsis

__m128 _mm_maskz_cvtepu64_ps (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:64] := 0
vcvtuqq2ps
__m128 _mm256_cvtepu64_ps (__m256i a)

Synopsis

__m128 _mm256_cvtepu64_ps (__m256i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 l := j*32 dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vcvtuqq2ps
__m128 _mm256_mask_cvtepu64_ps (__m128 src, __mmask8 k, __m256i a)

Synopsis

__m128 _mm256_mask_cvtepu64_ps (__m128 src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:128] := 0
vcvtuqq2ps
__m128 _mm256_maskz_cvtepu64_ps (__mmask8 k, __m256i a)

Synopsis

__m128 _mm256_maskz_cvtepu64_ps (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtuqq2ps
__m256 _mm512_cvtepu64_ps (__m512i a)

Synopsis

__m256 _mm512_cvtepu64_ps (__m512i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i]) ENDFOR dst[MAX:256] := 0
vcvtuqq2ps
__m256 _mm512_mask_cvtepu64_ps (__m256 src, __mmask8 k, __m512i a)

Synopsis

__m256 _mm512_mask_cvtepu64_ps (__m256 src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:256] := 0
vcvtuqq2ps
__m256 _mm512_maskz_cvtepu64_ps (__mmask8 k, __m512i a)

Synopsis

__m256 _mm512_maskz_cvtepu64_ps (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512DQ

Description

Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:256] := 0
pmovzxbw
__m128i _mm_cvtepu8_epi16 (__m128i a)

Synopsis

__m128i _mm_cvtepu8_epi16 (__m128i a)
#include "smmintrin.h"
Instruction: pmovzxbw xmm, xmm
CPUID Flags: SSE4.1

Description

Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*8 l := j*16 dst[l+15:l] := ZeroExtend(a[i+7:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpmovzxbw
__m128i _mm_mask_cvtepu8_epi16 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepu8_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbw
CPUID Flags: AVX512VL + AVX512BW

Description

Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*8 l := j*16 IF k[j] dst[l+15:l] := ZeroExtend(a[i+7:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
vpmovzxbw
__m128i _mm_maskz_cvtepu8_epi16 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepu8_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbw
CPUID Flags: AVX512VL + AVX512BW

Description

Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*8 l := j*16 IF k[j] dst[l+15:l] := ZeroExtend(a[i+7:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovzxbw
__m256i _mm256_cvtepu8_epi16 (__m128i a)

Synopsis

__m256i _mm256_cvtepu8_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxbw ymm, xmm
CPUID Flags: AVX2

Description

Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 l := j*16 dst[l+15:l] := ZeroExtend(a[i+7:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpmovzxbw
__m256i _mm256_mask_cvtepu8_epi16 (__m256i src, __mmask16 k, __m128i a)

Synopsis

__m256i _mm256_mask_cvtepu8_epi16 (__m256i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbw
CPUID Flags: AVX512VL + AVX512BW

Description

Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 l := j*16 IF k[j] dst[l+15:l] := ZeroExtend(a[i+7:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:256] := 0
vpmovzxbw
__m256i _mm256_maskz_cvtepu8_epi16 (__mmask16 k, __m128i a)

Synopsis

__m256i _mm256_maskz_cvtepu8_epi16 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbw
CPUID Flags: AVX512VL + AVX512BW

Description

Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 l := j*16 IF k[j] dst[l+15:l] := ZeroExtend(a[i+7:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovzxbw
__m512i _mm512_cvtepu8_epi16 (__m256i a)

Synopsis

__m512i _mm512_cvtepu8_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovzxbw
CPUID Flags: AVX512BW

Description

Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 l := j*16 dst[l+15:l] := ZeroExtend(a[i+7:i]) ENDFOR dst[MAX:512] := 0
vpmovzxbw
__m512i _mm512_mask_cvtepu8_epi16 (__m512i src, __mmask32 k, __m256i a)

Synopsis

__m512i _mm512_mask_cvtepu8_epi16 (__m512i src, __mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovzxbw
CPUID Flags: AVX512BW

Description

Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 l := j*16 IF k[j] dst[l+15:l] := ZeroExtend(a[i+7:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:512] := 0
vpmovzxbw
__m512i _mm512_maskz_cvtepu8_epi16 (__mmask32 k, __m256i a)

Synopsis

__m512i _mm512_maskz_cvtepu8_epi16 (__mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovzxbw
CPUID Flags: AVX512BW

Description

Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 l := j*16 IF k[j] dst[l+15:l] := ZeroExtend(a[i+7:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:512] := 0
pmovzxbd
__m128i _mm_cvtepu8_epi32 (__m128i a)

Synopsis

__m128i _mm_cvtepu8_epi32 (__m128i a)
#include "smmintrin.h"
Instruction: pmovzxbd xmm, xmm
CPUID Flags: SSE4.1

Description

Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 8*j dst[i+31:i] := ZeroExtend(a[k+7:k]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpmovzxbd
__m128i _mm_mask_cvtepu8_epi32 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepu8_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpmovzxbd
__m128i _mm_maskz_cvtepu8_epi32 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepu8_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovzxbd
__m256i _mm256_cvtepu8_epi32 (__m128i a)

Synopsis

__m256i _mm256_cvtepu8_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd ymm, xmm
CPUID Flags: AVX2

Description

Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 8*j dst[i+31:i] := ZeroExtend(a[k+7:k]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpmovzxbd
__m256i _mm256_mask_cvtepu8_epi32 (__m256i src, __mmask8 k, __m128i a)

Synopsis

__m256i _mm256_mask_cvtepu8_epi32 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpmovzxbd
__m256i _mm256_maskz_cvtepu8_epi32 (__mmask8 k, __m128i a)

Synopsis

__m256i _mm256_maskz_cvtepu8_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovzxbd
__m512i _mm512_cvtepu8_epi32 (__m128i a)

Synopsis

__m512i _mm512_cvtepu8_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd zmm {k}, xmm
CPUID Flags: AVX512F

Description

Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j k := 8*j dst[i+31:i] := ZeroExtend(a[k+7:k]) ENDFOR dst[MAX:512] := 0
vpmovzxbd
__m512i _mm512_mask_cvtepu8_epi32 (__m512i src, __mmask16 k, __m128i a)

Synopsis

__m512i _mm512_mask_cvtepu8_epi32 (__m512i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd zmm {k}, xmm
CPUID Flags: AVX512F

Description

Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpmovzxbd
__m512i _mm512_maskz_cvtepu8_epi32 (__mmask16 k, __m128i a)

Synopsis

__m512i _mm512_maskz_cvtepu8_epi32 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd zmm {k}, xmm
CPUID Flags: AVX512F

Description

Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
pmovzxbq
__m128i _mm_cvtepu8_epi64 (__m128i a)

Synopsis

__m128i _mm_cvtepu8_epi64 (__m128i a)
#include "smmintrin.h"
Instruction: pmovzxbq xmm, xmm
CPUID Flags: SSE4.1

Description

Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j k := 8*j dst[i+63:i] := ZeroExtend(a[k+7:k]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpmovzxbq
__m128i _mm_mask_cvtepu8_epi64 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtepu8_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpmovzxbq
__m128i _mm_maskz_cvtepu8_epi64 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtepu8_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovzxbq
__m256i _mm256_cvtepu8_epi64 (__m128i a)

Synopsis

__m256i _mm256_cvtepu8_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq ymm, xmm
CPUID Flags: AVX2

Description

Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 8*j dst[i+63:i] := ZeroExtend(a[k+7:k]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpmovzxbq
__m256i _mm256_mask_cvtepu8_epi64 (__m256i src, __mmask8 k, __m128i a)

Synopsis

__m256i _mm256_mask_cvtepu8_epi64 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpmovzxbq
__m256i _mm256_maskz_cvtepu8_epi64 (__mmask8 k, __m128i a)

Synopsis

__m256i _mm256_maskz_cvtepu8_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq
CPUID Flags: AVX512VL + AVX512F

Description

Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovzxbq
__m512i _mm512_cvtepu8_epi64 (__m128i a)

Synopsis

__m512i _mm512_cvtepu8_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq zmm {k}, xmm
CPUID Flags: AVX512F

Description

Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j k := 8*j dst[i+63:i] := ZeroExtend(a[k+7:k]) ENDFOR dst[MAX:512] := 0
vpmovzxbq
__m512i _mm512_mask_cvtepu8_epi64 (__m512i src, __mmask8 k, __m128i a)

Synopsis

__m512i _mm512_mask_cvtepu8_epi64 (__m512i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq zmm {k}, xmm
CPUID Flags: AVX512F

Description

Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpmovzxbq
__m512i _mm512_maskz_cvtepu8_epi64 (__mmask8 k, __m128i a)

Synopsis

__m512i _mm512_maskz_cvtepu8_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq zmm {k}, xmm
CPUID Flags: AVX512F

Description

Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtfxpntdq2ps
__m512 _mm512_cvtfxpnt_round_adjustepi32_ps (__m512i v2, int rounding, _MM_EXP_ADJ_ENUM expadj)

Synopsis

__m512 _mm512_cvtfxpnt_round_adjustepi32_ps (__m512i v2, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vcvtfxpntdq2ps zmm {k}, m512, imm
CPUID Flags: KNCNI

Description

Performs element-by-element conversion of packed 32-bit integer elements in v2 to packed single-precision (32-bit) floating-point elements and performs an optional exponent adjust using expadj, storing the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := Int32ToFloat32(v2[i+31:i]) CASE expadj OF _MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0 _MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4 _MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5 _MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8 _MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16 _MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24 _MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31 _MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32 ESAC ENDFOR dst[MAX:512] := 0
vcvtfxpntudq2ps
__m512 _mm512_cvtfxpnt_round_adjustepu32_ps (__m512i v2, int rounding, _MM_EXP_ADJ_ENUM expadj)

Synopsis

__m512 _mm512_cvtfxpnt_round_adjustepu32_ps (__m512i v2, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vcvtfxpntudq2ps zmm {k}, zmm, imm
CPUID Flags: KNCNI

Description

Performs element-by-element conversion of packed 32-bit unsigned integer elements in v2 to packed single-precision (32-bit) floating-point elements and performs an optional exponent adjust using expadj, storing the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := UInt32ToFloat32(v2[i+31:i]) CASE expadj OF _MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0 _MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4 _MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5 _MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8 _MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16 _MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24 _MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31 _MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32 ESAC ENDFOR dst[MAX:512] := 0
vcvtfxpntudq2ps
__m512 _mm512_mask_cvtfxpnt_round_adjustepu32_ps (__m512 src, __mmask16 k, __m512i v2, int rounding, _MM_EXP_ADJ_ENUM expadj)

Synopsis

__m512 _mm512_mask_cvtfxpnt_round_adjustepu32_ps (__m512 src, __mmask16 k, __m512i v2, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vcvtfxpntudq2ps zmm {k}, zmm, imm
CPUID Flags: KNCNI

Description

Performs element-by-element conversion of packed 32-bit unsigned integer elements in v2 to packed single-precision (32-bit) floating-point elements and performs an optional exponent adjust using expadj, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := UInt32ToFloat32(v2[i+31:i]) CASE expadj OF _MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0 _MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4 _MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5 _MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8 _MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16 _MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24 _MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31 _MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32 ESAC ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vcvtfxpntps2dq
__m512i _mm512_cvtfxpnt_round_adjustps_epi32 (__m512 v2, int rounding, _MM_EXP_ADJ_ENUM expadj)

Synopsis

__m512i _mm512_cvtfxpnt_round_adjustps_epi32 (__m512 v2, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vcvtfxpntps2dq zmm {k}, zmm, imm
CPUID Flags: KNCNI

Description

Performs element-by-element conversion of packed single-precision (32-bit) floating-point elements in v2 to packed 32-bit integer elements and performs an optional exponent adjust using expadj, storing the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := Float32ToInt32(v2[i+31:i]) CASE expadj OF _MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0 _MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4 _MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5 _MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8 _MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16 _MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24 _MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31 _MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32 ESAC ENDFOR dst[MAX:512] := 0
vcvtfxpntps2udq
__m512i _mm512_cvtfxpnt_round_adjustps_epu32 (__m512 v2, int rounding, _MM_EXP_ADJ_ENUM expadj)

Synopsis

__m512i _mm512_cvtfxpnt_round_adjustps_epu32 (__m512 v2, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vcvtfxpntps2udq zmm {k}, zmm, imm
CPUID Flags: KNCNI

Description

Performs element-by-element conversion of packed single-precision (32-bit) floating-point elements in v2 to packed 32-bit unsigned integer elements and performs an optional exponent adjust using expadj, storing the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := Float32ToUInt32(v2[i+31:i]) CASE expadj OF _MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0 _MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4 _MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5 _MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8 _MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16 _MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24 _MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31 _MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32 ESAC ENDFOR dst[MAX:512] := 0
vcvtfxpntpd2dq
__m512i _mm512_cvtfxpnt_roundpd_epi32lo (__m512d v2, int rounding)

Synopsis

__m512i _mm512_cvtfxpnt_roundpd_epi32lo (__m512d v2, int rounding)
#include "immintrin.h"
Instruction: vcvtfxpntpd2dq zmm {k}, m512, imm
CPUID Flags: KNCNI

Description

Performs an element-by-element conversion of elements in packed double-precision (64-bit) floating-point vector v2 to 32-bit integer elements, storing them in the lower half of dst. The elements in the upper half of dst are set to 0.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 k := j*32 dst[k+31:k] := Float64ToInt32(v2[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvtfxpntpd2dq
__m512i _mm512_mask_cvtfxpnt_roundpd_epi32lo (__m512i src, __mmask8 k, __m512d v2, int rounding)

Synopsis

__m512i _mm512_mask_cvtfxpnt_roundpd_epi32lo (__m512i src, __mmask8 k, __m512d v2, int rounding)
#include "immintrin.h"
Instruction: vcvtfxpntpd2dq zmm {k}, m512, imm
CPUID Flags: KNCNI

Description

Performs an element-by-element conversion of elements in packed double-precision (64-bit) floating-point vector v2 to 32-bit integer elements, storing them in the lower half of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The elements in the upper half of dst are set to 0.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[l+31:l] := Float64ToInt32(v2[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:512] := 0
vcvtfxpntpd2udq
__m512i _mm512_cvtfxpnt_roundpd_epu32lo (__m512d v2, int rounding)

Synopsis

__m512i _mm512_cvtfxpnt_roundpd_epu32lo (__m512d v2, int rounding)
#include "immintrin.h"
Instruction: vcvtfxpntpd2udq zmm {k}, zmm, imm
CPUID Flags: KNCNI

Description

Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to packed 32-bit unsigned integer elements, storing the results in dst. Results are written to the lower half of dst, and the upper half locations are set to '0'.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 k := j*32 dst[k+31:k] := Float64ToUInt32(v2[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvtfxpntpd2udq
__m512i _mm512_mask_cvtfxpnt_roundpd_epu32lo (__m512i src, __mmask8 k, __m512d v2, int rounding)

Synopsis

__m512i _mm512_mask_cvtfxpnt_roundpd_epu32lo (__m512i src, __mmask8 k, __m512d v2, int rounding)
#include "immintrin.h"
Instruction: vcvtfxpntpd2udq zmm {k}, zmm, imm
CPUID Flags: KNCNI

Description

Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to packed 32-bit unsigned integer elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Results are written to the lower half of dst, and the upper half locations are set to '0'.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[l+31:l] := Float64ToUInt32(v2[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:512] := 0
vcvtsi2sd
__m128d _mm_cvti32_sd (__m128d a, int b)

Synopsis

__m128d _mm_cvti32_sd (__m128d a, int b)
#include "immintrin.h"
Instruction: vcvtsi2sd xmm, xmm, r32
CPUID Flags: AVX512F

Description

Convert the 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := Convert_Int32_To_FP64(b[31:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0
vcvtsi2ss
__m128 _mm_cvti32_ss (__m128 a, int b)

Synopsis

__m128 _mm_cvti32_ss (__m128 a, int b)
#include "immintrin.h"
Instruction: vcvtsi2ss xmm, xmm, r32
CPUID Flags: AVX512F

Description

Convert the 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := Convert_Int32_To_FP32(b[31:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
vcvtsi2sd
__m128d _mm_cvti64_sd (__m128d a, __int64 b)

Synopsis

__m128d _mm_cvti64_sd (__m128d a, __int64 b)
#include "immintrin.h"
Instruction: vcvtsi2sd xmm, xmm, r64
CPUID Flags: AVX512F

Description

Convert the 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := Convert_Int64_To_FP64(b[63:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0
vcvtsi2ss
__m128 _mm_cvti64_ss (__m128 a, __int64 b)

Synopsis

__m128 _mm_cvti64_ss (__m128 a, __int64 b)
#include "immintrin.h"
Instruction: vcvtsi2ss xmm, xmm, r64
CPUID Flags: AVX512F

Description

Convert the 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := Convert_Int64_To_FP32(b[63:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
cvtpd2dq
__m128i _mm_cvtpd_epi32 (__m128d a)

Synopsis

__m128i _mm_cvtpd_epi32 (__m128d a)
#include "emmintrin.h"
Instruction: cvtpd2dq xmm, xmm
CPUID Flags: SSE2

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 1 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell40.8
Ivy Bridge41
Sandy Bridge41
Westmere41
Nehalem41
vcvtpd2dq
__m128i _mm_mask_cvtpd_epi32 (__m128i src, __mmask8 k, __m128d a)

Synopsis

__m128i _mm_mask_cvtpd_epi32 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:64] := 0
vcvtpd2dq
__m128i _mm_maskz_cvtpd_epi32 (__mmask8 k, __m128d a)

Synopsis

__m128i _mm_maskz_cvtpd_epi32 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:64] := 0
vcvtpd2dq
__m128i _mm256_cvtpd_epi32 (__m256d a)

Synopsis

__m128i _mm256_cvtpd_epi32 (__m256d a)
#include "immintrin.h"
Instruction: vcvtpd2dq xmm, ymm
CPUID Flags: AVX

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell41
Ivy Bridge41
Sandy Bridge41
vcvtpd2dq
__m128i _mm256_mask_cvtpd_epi32 (__m128i src, __mmask8 k, __m256d a)

Synopsis

__m128i _mm256_mask_cvtpd_epi32 (__m128i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vcvtpd2dq
__m128i _mm256_maskz_cvtpd_epi32 (__mmask8 k, __m256d a)

Synopsis

__m128i _mm256_maskz_cvtpd_epi32 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtpd2dq
__m256i _mm512_cvtpd_epi32 (__m512d a)

Synopsis

__m256i _mm512_cvtpd_epi32 (__m512d a)
#include "immintrin.h"
Instruction: vcvtpd2dq ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) ENDFOR dst[MAX:256] := 0
vcvtpd2dq
__m256i _mm512_mask_cvtpd_epi32 (__m256i src, __mmask8 k, __m512d a)

Synopsis

__m256i _mm512_mask_cvtpd_epi32 (__m256i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2dq ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvtpd2dq
__m256i _mm512_maskz_cvtpd_epi32 (__mmask8 k, __m512d a)

Synopsis

__m256i _mm512_maskz_cvtpd_epi32 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2dq ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtpd2qq
__m128i _mm_cvtpd_epi64 (__m128d a)

Synopsis

__m128i _mm_cvtpd_epi64 (__m128d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vcvtpd2qq
__m128i _mm_mask_cvtpd_epi64 (__m128i src, __mmask8 k, __m128d a)

Synopsis

__m128i _mm_mask_cvtpd_epi64 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vcvtpd2qq
__m128i _mm_maskz_cvtpd_epi64 (__mmask8 k, __m128d a)

Synopsis

__m128i _mm_maskz_cvtpd_epi64 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtpd2qq
__m256i _mm256_cvtpd_epi64 (__m256d a)

Synopsis

__m256i _mm256_cvtpd_epi64 (__m256d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ENDFOR dst[MAX:256] := 0
vcvtpd2qq
__m256i _mm256_mask_cvtpd_epi64 (__m256i src, __mmask8 k, __m256d a)

Synopsis

__m256i _mm256_mask_cvtpd_epi64 (__m256i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vcvtpd2qq
__m256i _mm256_maskz_cvtpd_epi64 (__mmask8 k, __m256d a)

Synopsis

__m256i _mm256_maskz_cvtpd_epi64 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtpd2qq
__m512i _mm512_cvtpd_epi64 (__m512d a)

Synopsis

__m512i _mm512_cvtpd_epi64 (__m512d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvtpd2qq
__m512i _mm512_mask_cvtpd_epi64 (__m512i src, __mmask8 k, __m512d a)

Synopsis

__m512i _mm512_mask_cvtpd_epi64 (__m512i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvtpd2qq
__m512i _mm512_maskz_cvtpd_epi64 (__mmask8 k, __m512d a)

Synopsis

__m512i _mm512_maskz_cvtpd_epi64 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtpd2udq
__m128i _mm_cvtpd_epu32 (__m128d a)

Synopsis

__m128i _mm_cvtpd_epu32 (__m128d a)
#include "immintrin.h"
Instruction: vcvtpd2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 1 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k]) ENDFOR dst[MAX:64] := 0
vcvtpd2udq
__m128i _mm_mask_cvtpd_epu32 (__m128i src, __mmask8 k, __m128d a)

Synopsis

__m128i _mm_mask_cvtpd_epu32 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:64] := 0
vcvtpd2udq
__m128i _mm_maskz_cvtpd_epu32 (__mmask8 k, __m128d a)

Synopsis

__m128i _mm_maskz_cvtpd_epu32 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:64] := 0
vcvtpd2udq
__m128i _mm256_cvtpd_epu32 (__m256d a)

Synopsis

__m128i _mm256_cvtpd_epu32 (__m256d a)
#include "immintrin.h"
Instruction: vcvtpd2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k]) ENDFOR dst[MAX:128] := 0
vcvtpd2udq
__m128i _mm256_mask_cvtpd_epu32 (__m128i src, __mmask8 k, __m256d a)

Synopsis

__m128i _mm256_mask_cvtpd_epu32 (__m128i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vcvtpd2udq
__m128i _mm256_maskz_cvtpd_epu32 (__mmask8 k, __m256d a)

Synopsis

__m128i _mm256_maskz_cvtpd_epu32 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtpd2udq
__m256i _mm512_cvtpd_epu32 (__m512d a)

Synopsis

__m256i _mm512_cvtpd_epu32 (__m512d a)
#include "immintrin.h"
Instruction: vcvtpd2udq ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k]) ENDFOR dst[MAX:256] := 0
vcvtpd2udq
__m256i _mm512_mask_cvtpd_epu32 (__m256i src, __mmask8 k, __m512d a)

Synopsis

__m256i _mm512_mask_cvtpd_epu32 (__m256i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2udq ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvtpd2udq
__m256i _mm512_maskz_cvtpd_epu32 (__mmask8 k, __m512d a)

Synopsis

__m256i _mm512_maskz_cvtpd_epu32 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2udq ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtpd2uqq
__m128i _mm_cvtpd_epu64 (__m128d a)

Synopsis

__m128i _mm_cvtpd_epu64 (__m128d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vcvtpd2uqq
__m128i _mm_mask_cvtpd_epu64 (__m128i src, __mmask8 k, __m128d a)

Synopsis

__m128i _mm_mask_cvtpd_epu64 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vcvtpd2uqq
__m128i _mm_maskz_cvtpd_epu64 (__mmask8 k, __m128d a)

Synopsis

__m128i _mm_maskz_cvtpd_epu64 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtpd2uqq
__m256i _mm256_cvtpd_epu64 (__m256d a)

Synopsis

__m256i _mm256_cvtpd_epu64 (__m256d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i]) ENDFOR dst[MAX:256] := 0
vcvtpd2uqq
__m256i _mm256_mask_cvtpd_epu64 (__m256i src, __mmask8 k, __m256d a)

Synopsis

__m256i _mm256_mask_cvtpd_epu64 (__m256i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vcvtpd2uqq
__m256i _mm256_maskz_cvtpd_epu64 (__mmask8 k, __m256d a)

Synopsis

__m256i _mm256_maskz_cvtpd_epu64 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtpd2uqq
__m512i _mm512_cvtpd_epu64 (__m512d a)

Synopsis

__m512i _mm512_cvtpd_epu64 (__m512d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvtpd2uqq
__m512i _mm512_mask_cvtpd_epu64 (__m512i src, __mmask8 k, __m512d a)

Synopsis

__m512i _mm512_mask_cvtpd_epu64 (__m512i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvtpd2uqq
__m512i _mm512_maskz_cvtpd_epu64 (__mmask8 k, __m512d a)

Synopsis

__m512i _mm512_maskz_cvtpd_epu64 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
cvtpd2pi
__m64 _mm_cvtpd_pi32 (__m128d a)

Synopsis

__m64 _mm_cvtpd_pi32 (__m128d a)
#include "emmintrin.h"
Instruction: cvtpd2pi mm, xmm
CPUID Flags: SSE2

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 1 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) ENDFOR
cvtpd2ps
__m128 _mm_cvtpd_ps (__m128d a)

Synopsis

__m128 _mm_cvtpd_ps (__m128d a)
#include "emmintrin.h"
Instruction: cvtpd2ps xmm, xmm
CPUID Flags: SSE2

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 1 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell40.8
Ivy Bridge41
Sandy Bridge41
Westmere41
Nehalem41
vcvtpd2ps
__m128 _mm_mask_cvtpd_ps (__m128 src, __mmask8 k, __m128d a)

Synopsis

__m128 _mm_mask_cvtpd_ps (__m128 src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2ps
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:64] := 0
vcvtpd2ps
__m128 _mm_maskz_cvtpd_ps (__mmask8 k, __m128d a)

Synopsis

__m128 _mm_maskz_cvtpd_ps (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2ps
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:64] := 0
vcvtpd2ps
__m128 _mm256_cvtpd_ps (__m256d a)

Synopsis

__m128 _mm256_cvtpd_ps (__m256d a)
#include "immintrin.h"
Instruction: vcvtpd2ps xmm, ymm
CPUID Flags: AVX

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell41
Ivy Bridge41
Sandy Bridge41
vcvtpd2ps
__m128 _mm256_mask_cvtpd_ps (__m128 src, __mmask8 k, __m256d a)

Synopsis

__m128 _mm256_mask_cvtpd_ps (__m128 src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2ps
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vcvtpd2ps
__m128 _mm256_maskz_cvtpd_ps (__mmask8 k, __m256d a)

Synopsis

__m128 _mm256_maskz_cvtpd_ps (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2ps
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtpd2ps
__m256 _mm512_cvtpd_ps (__m512d a)

Synopsis

__m256 _mm512_cvtpd_ps (__m512d a)
#include "immintrin.h"
Instruction: vcvtpd2ps ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) ENDFOR dst[MAX:256] := 0
vcvtpd2ps
__m256 _mm512_mask_cvtpd_ps (__m256 src, __mmask8 k, __m512d a)

Synopsis

__m256 _mm512_mask_cvtpd_ps (__m256 src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2ps ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvtpd2ps
__m256 _mm512_maskz_cvtpd_ps (__mmask8 k, __m512d a)

Synopsis

__m256 _mm512_maskz_cvtpd_ps (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2ps ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtpd2ps
__m512 _mm512_cvtpd_pslo (__m512d v2)

Synopsis

__m512 _mm512_cvtpd_pslo (__m512d v2)
#include "immintrin.h"
Instruction: vcvtpd2ps zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst. The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0.

Operation

FOR j := 0 to 7 i := j*64 k := j*32 dst[k+31:k] := Float64ToFloat32(v2[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvtpd2ps
__m512 _mm512_mask_cvtpd_pslo (__m512 src, __mmask8 k, __m512d v2)

Synopsis

__m512 _mm512_mask_cvtpd_pslo (__m512 src, __mmask8 k, __m512d v2)
#include "immintrin.h"
Instruction: vcvtpd2ps zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[l+31:l] := Float64ToFloat32(v2[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:512] := 0
vcvtph2ps
__m128 _mm_cvtph_ps (__m128i a)

Synopsis

__m128 _mm_cvtph_ps (__m128i a)
#include "immintrin.h"
Instruction: vcvtph2ps xmm, xmm
CPUID Flags: FP16C

Description

Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 m := j*16 dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell4-
Haswell41
Ivy Bridge7-
Ivy Bridge71
vcvtph2ps
__m128 _mm_mask_cvtph_ps (__m128 src, __mmask8 k, __m128i a)

Synopsis

__m128 _mm_mask_cvtph_ps (__m128 src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtph2ps
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vcvtph2ps
__m128 _mm_maskz_cvtph_ps (__mmask8 k, __m128i a)

Synopsis

__m128 _mm_maskz_cvtph_ps (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtph2ps
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtph2ps
__m256 _mm256_cvtph_ps (__m128i a)

Synopsis

__m256 _mm256_cvtph_ps (__m128i a)
#include "immintrin.h"
Instruction: vcvtph2ps ymm, xmm
CPUID Flags: FP16C

Description

Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 m := j*16 dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell61
Haswell6-
Ivy Bridge71
Ivy Bridge7-
vcvtph2ps
__m256 _mm256_mask_cvtph_ps (__m256 src, __mmask8 k, __m128i a)

Synopsis

__m256 _mm256_mask_cvtph_ps (__m256 src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtph2ps
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvtph2ps
__m256 _mm256_maskz_cvtph_ps (__mmask8 k, __m128i a)

Synopsis

__m256 _mm256_maskz_cvtph_ps (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtph2ps
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtph2ps
__m512 _mm512_cvtph_ps (__m256i a)

Synopsis

__m512 _mm512_cvtph_ps (__m256i a)
#include "immintrin.h"
Instruction: vcvtph2ps zmm {k}, ymm
CPUID Flags: AVX512F

Description

Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 m := j*16 dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ENDFOR dst[MAX:512] := 0
vcvtph2ps
__m512 _mm512_mask_cvtph_ps (__m512 src, __mmask16 k, __m256i a)

Synopsis

__m512 _mm512_mask_cvtph_ps (__m512 src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtph2ps zmm {k}, ymm
CPUID Flags: AVX512F

Description

Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vcvtph2ps
__m512 _mm512_maskz_cvtph_ps (__mmask16 k, __m256i a)

Synopsis

__m512 _mm512_maskz_cvtph_ps (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtph2ps zmm {k}, ymm
CPUID Flags: AVX512F

Description

Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_cvtpi16_ps (__m64 a)

Synopsis

__m128 _mm_cvtpi16_ps (__m64 a)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Convert packed 16-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*16 m := j*32 dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) ENDFOR
cvtpi2pd
__m128d _mm_cvtpi32_pd (__m64 a)

Synopsis

__m128d _mm_cvtpi32_pd (__m64 a)
#include "emmintrin.h"
Instruction: cvtpi2pd xmm, mm
CPUID Flags: SSE2

Description

Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*32 m := j*64 dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ENDFOR
cvtpi2ps
__m128 _mm_cvtpi32_ps (__m128 a, __m64 b)

Synopsis

__m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
#include "xmmintrin.h"
Instruction: cvtpi2ps xmm, mm
CPUID Flags: SSE

Description

Convert packed 32-bit integers in b to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of dst, and copy the upper 2 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := Convert_Int32_To_FP32(b[31:0]) dst[63:32] := Convert_Int32_To_FP32(b[63:32]) dst[95:64] := a[95:64] dst[127:96] := a[127:96]

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
...
__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)

Synopsis

__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of dst, then convert the packed 32-bit integers in b to packed single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements of dst.

Operation

dst[31:0] := Convert_Int32_To_FP32(a[31:0]) dst[63:32] := Convert_Int32_To_FP32(a[63:32]) dst[95:64] := Convert_Int32_To_FP32(b[31:0]) dst[127:96] := Convert_Int32_To_FP32(b[63:32])
...
__m128 _mm_cvtpi8_ps (__m64 a)

Synopsis

__m128 _mm_cvtpi8_ps (__m64 a)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Convert the lower packed 8-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*8 m := j*32 dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) ENDFOR
cvtps2dq
__m128i _mm_cvtps_epi32 (__m128 a)

Synopsis

__m128i _mm_cvtps_epi32 (__m128 a)
#include "emmintrin.h"
Instruction: cvtps2dq xmm, xmm
CPUID Flags: SSE2

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell30.8
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vcvtps2dq
__m128i _mm_mask_cvtps_epi32 (__m128i src, __mmask8 k, __m128 a)

Synopsis

__m128i _mm_mask_cvtps_epi32 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vcvtps2dq
__m128i _mm_maskz_cvtps_epi32 (__mmask8 k, __m128 a)

Synopsis

__m128i _mm_maskz_cvtps_epi32 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtps2dq
__m256i _mm256_cvtps_epi32 (__m256 a)

Synopsis

__m256i _mm256_cvtps_epi32 (__m256 a)
#include "immintrin.h"
Instruction: vcvtps2dq ymm, ymm
CPUID Flags: AVX

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
vcvtps2dq
__m256i _mm256_mask_cvtps_epi32 (__m256i src, __mmask8 k, __m256 a)

Synopsis

__m256i _mm256_mask_cvtps_epi32 (__m256i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvtps2dq
__m256i _mm256_maskz_cvtps_epi32 (__mmask8 k, __m256 a)

Synopsis

__m256i _mm256_maskz_cvtps_epi32 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtps2dq
__m512i _mm512_cvtps_epi32 (__m512 a)

Synopsis

__m512i _mm512_cvtps_epi32 (__m512 a)
#include "immintrin.h"
Instruction: vcvtps2dq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvtps2dq
__m512i _mm512_mask_cvtps_epi32 (__m512i src, __mmask16 k, __m512 a)

Synopsis

__m512i _mm512_mask_cvtps_epi32 (__m512i src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvtps2dq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vcvtps2dq
__m512i _mm512_maskz_cvtps_epi32 (__mmask16 k, __m512 a)

Synopsis

__m512i _mm512_maskz_cvtps_epi32 (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvtps2dq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtps2qq
__m128i _mm_cvtps_epi64 (__m128 a)

Synopsis

__m128i _mm_cvtps_epi64 (__m128 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ENDFOR dst[MAX:128] := 0
vcvtps2qq
__m128i _mm_mask_cvtps_epi64 (__m128i src, __mmask8 k, __m128 a)

Synopsis

__m128i _mm_mask_cvtps_epi64 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vcvtps2qq
__m128i _mm_maskz_cvtps_epi64 (__mmask8 k, __m128 a)

Synopsis

__m128i _mm_maskz_cvtps_epi64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtps2qq
__m256i _mm256_cvtps_epi64 (__m128 a)

Synopsis

__m256i _mm256_cvtps_epi64 (__m128 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ENDFOR dst[MAX:256] := 0
vcvtps2qq
__m256i _mm256_mask_cvtps_epi64 (__m256i src, __mmask8 k, __m128 a)

Synopsis

__m256i _mm256_mask_cvtps_epi64 (__m256i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vcvtps2qq
__m256i _mm256_maskz_cvtps_epi64 (__mmask8 k, __m128 a)

Synopsis

__m256i _mm256_maskz_cvtps_epi64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtps2qq
__m512i _mm512_cvtps_epi64 (__m256 a)

Synopsis

__m512i _mm512_cvtps_epi64 (__m256 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ENDFOR dst[MAX:512] := 0
vcvtps2qq
__m512i _mm512_mask_cvtps_epi64 (__m512i src, __mmask8 k, __m256 a)

Synopsis

__m512i _mm512_mask_cvtps_epi64 (__m512i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvtps2qq
__m512i _mm512_maskz_cvtps_epi64 (__mmask8 k, __m256 a)

Synopsis

__m512i _mm512_maskz_cvtps_epi64 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtps2udq
__m128i _mm_cvtps_epu32 (__m128 a)

Synopsis

__m128i _mm_cvtps_epu32 (__m128 a)
#include "immintrin.h"
Instruction: vcvtps2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ENDFOR dst[MAX:128] := 0
vcvtps2udq
__m128i _mm_mask_cvtps_epu32 (__m128i src, __mmask8 k, __m128 a)

Synopsis

__m128i _mm_mask_cvtps_epu32 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vcvtps2udq
__m128i _mm_maskz_cvtps_epu32 (__mmask8 k, __m128 a)

Synopsis

__m128i _mm_maskz_cvtps_epu32 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtps2udq
__m256i _mm256_cvtps_epu32 (__m256 a)

Synopsis

__m256i _mm256_cvtps_epu32 (__m256 a)
#include "immintrin.h"
Instruction: vcvtps2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ENDFOR dst[MAX:256] := 0
vcvtps2udq
__m256i _mm256_mask_cvtps_epu32 (__m256i src, __mmask8 k, __m256 a)

Synopsis

__m256i _mm256_mask_cvtps_epu32 (__m256i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvtps2udq
__m256i _mm256_maskz_cvtps_epu32 (__mmask8 k, __m256 a)

Synopsis

__m256i _mm256_maskz_cvtps_epu32 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtps2udq
__m512i _mm512_cvtps_epu32 (__m512 a)

Synopsis

__m512i _mm512_cvtps_epu32 (__m512 a)
#include "immintrin.h"
Instruction: vcvtps2udq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvtps2udq
__m512i _mm512_mask_cvtps_epu32 (__m512i src, __mmask16 k, __m512 a)

Synopsis

__m512i _mm512_mask_cvtps_epu32 (__m512i src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvtps2udq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vcvtps2udq
__m512i _mm512_maskz_cvtps_epu32 (__mmask16 k, __m512 a)

Synopsis

__m512i _mm512_maskz_cvtps_epu32 (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvtps2udq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtps2uqq
__m128i _mm_cvtps_epu64 (__m128 a)

Synopsis

__m128i _mm_cvtps_epu64 (__m128 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l]) ENDFOR dst[MAX:128] := 0
vcvtps2uqq
__m128i _mm_mask_cvtps_epu64 (__m128i src, __mmask8 k, __m128 a)

Synopsis

__m128i _mm_mask_cvtps_epu64 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vcvtps2uqq
__m128i _mm_maskz_cvtps_epu64 (__mmask8 k, __m128 a)

Synopsis

__m128i _mm_maskz_cvtps_epu64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtps2uqq
__m256i _mm256_cvtps_epu64 (__m128 a)

Synopsis

__m256i _mm256_cvtps_epu64 (__m128 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l]) ENDFOR dst[MAX:256] := 0
vcvtps2uqq
__m256i _mm256_mask_cvtps_epu64 (__m256i src, __mmask8 k, __m128 a)

Synopsis

__m256i _mm256_mask_cvtps_epu64 (__m256i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vcvtps2uqq
__m256i _mm256_maskz_cvtps_epu64 (__mmask8 k, __m128 a)

Synopsis

__m256i _mm256_maskz_cvtps_epu64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvtps2uqq
__m512i _mm512_cvtps_epu64 (__m256 a)

Synopsis

__m512i _mm512_cvtps_epu64 (__m256 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l]) ENDFOR dst[MAX:512] := 0
vcvtps2uqq
__m512i _mm512_mask_cvtps_epu64 (__m512i src, __mmask8 k, __m256 a)

Synopsis

__m512i _mm512_mask_cvtps_epu64 (__m512i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvtps2uqq
__m512i _mm512_maskz_cvtps_epu64 (__mmask8 k, __m256 a)

Synopsis

__m512i _mm512_maskz_cvtps_epu64 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
cvtps2pd
__m128d _mm_cvtps_pd (__m128 a)

Synopsis

__m128d _mm_cvtps_pd (__m128 a)
#include "emmintrin.h"
Instruction: cvtps2pd xmm, xmm
CPUID Flags: SSE2

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j k := 32*j dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell20.8
Ivy Bridge21
Sandy Bridge21
Westmere21
Nehalem21
vcvtps2pd
__m256d _mm256_cvtps_pd (__m128 a)

Synopsis

__m256d _mm256_cvtps_pd (__m128 a)
#include "immintrin.h"
Instruction: vcvtps2pd ymm, xmm
CPUID Flags: AVX

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 32*j dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell21
Ivy Bridge21
Sandy Bridge21
vcvtps2pd
__m512d _mm512_cvtps_pd (__m256 a)

Synopsis

__m512d _mm512_cvtps_pd (__m256 a)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, ymm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j k := 32*j dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) ENDFOR dst[MAX:512] := 0
vcvtps2pd
__m512d _mm512_mask_cvtps_pd (__m512d src, __mmask8 k, __m256 a)

Synopsis

__m512d _mm512_mask_cvtps_pd (__m512d src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, ymm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvtps2pd
__m512d _mm512_maskz_cvtps_pd (__mmask8 k, __m256 a)

Synopsis

__m512d _mm512_maskz_cvtps_pd (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, ymm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvtps2ph
__m128i _mm_cvtps_ph (__m128 a, int rounding)

Synopsis

__m128i _mm_cvtps_ph (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph xmm, xmm, imm
CPUID Flags: FP16C

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 3 i := 16*j l := 32*j dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell4-
Haswell41
Ivy Bridge10-
Ivy Bridge101
vcvtps2ph
__m128i _mm_mask_cvtps_ph (__m128i src, __mmask8 k, __m128 a, int rounding)

Synopsis

__m128i _mm_mask_cvtps_ph (__m128i src, __mmask8 k, __m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 3 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:64] := 0
vcvtps2ph
__m128i _mm_maskz_cvtps_ph (__mmask8 k, __m128 a, int rounding)

Synopsis

__m128i _mm_maskz_cvtps_ph (__mmask8 k, __m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 3 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:64] := 0
vcvtps2ph
__m128i _mm256_cvtps_ph (__m256 a, int rounding)

Synopsis

__m128i _mm256_cvtps_ph (__m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph xmm, ymm, imm
CPUID Flags: FP16C

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := 16*j l := 32*j dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell61
Haswell6-
Ivy Bridge101
Ivy Bridge10-
vcvtps2ph
__m128i _mm256_mask_cvtps_ph (__m128i src, __mmask8 k, __m256 a, int rounding)

Synopsis

__m128i _mm256_mask_cvtps_ph (__m128i src, __mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vcvtps2ph
__m128i _mm256_maskz_cvtps_ph (__mmask8 k, __m256 a, int rounding)

Synopsis

__m128i _mm256_maskz_cvtps_ph (__mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvtps2ph
__m256i _mm512_cvtps_ph (__m512 a, int rounding)

Synopsis

__m256i _mm512_cvtps_ph (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph ymm {k}, zmm {sae}, imm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := 16*j l := 32*j dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ENDFOR dst[MAX:256] := 0
vcvtps2ph
__m256i _mm512_mask_cvtps_ph (__m256i src, __mmask16 k, __m512 a, int rounding)

Synopsis

__m256i _mm512_mask_cvtps_ph (__m256i src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph ymm {k}, zmm {sae}, imm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vcvtps2ph
__m256i _mm512_maskz_cvtps_ph (__mmask16 k, __m512 a, int rounding)

Synopsis

__m256i _mm512_maskz_cvtps_ph (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph ymm {k}, zmm {sae}, imm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
...
__m64 _mm_cvtps_pi16 (__m128 a)

Synopsis

__m64 _mm_cvtps_pi16 (__m128 a)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 16-bit integers, and store the results in dst.

Operation

FOR j := 0 to 3 i := 16*j k := 32*j dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k]) ENDFOR
cvtps2pi
__m64 _mm_cvtps_pi32 (__m128 a)

Synopsis

__m64 _mm_cvtps_pi32 (__m128 a)
#include "xmmintrin.h"
Instruction: cvtps2pi mm, xmm
CPUID Flags: SSE

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.

Operation

FOR j := 0 to 1 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
...
__m64 _mm_cvtps_pi8 (__m128 a)

Synopsis

__m64 _mm_cvtps_pi8 (__m128 a)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 8-bit integers, and store the results in the lower 4 elements of dst.

Operation

FOR j := 0 to 3 i := 8*j k := 32*j dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k]) ENDFOR
vcvtps2pd
__m512d _mm512_cvtpslo_pd (__m512 v2)

Synopsis

__m512d _mm512_cvtpslo_pd (__m512 v2)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst.

Operation

FOR j := 0 to 7 i := j*32 k := j*64 dst[k+63:k] := Float32ToFloat64(v2[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvtps2pd
__m512d _mm512_mask_cvtpslo_pd (__m512d src, __mmask8 k, __m512 v2)

Synopsis

__m512d _mm512_mask_cvtpslo_pd (__m512d src, __mmask8 k, __m512 v2)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[l+63:l] := Float32ToFloat64(v2[i+31:i]) ELSE dst[l+63:l] := src[l+63:l] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_cvtpu16_ps (__m64 a)

Synopsis

__m128 _mm_cvtpu16_ps (__m64 a)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Convert packed unsigned 16-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*16 m := j*32 dst[m+31:m] := Convert_UnsignedInt16_To_FP32(a[i+15:i]) ENDFOR
...
__m128 _mm_cvtpu8_ps (__m64 a)

Synopsis

__m128 _mm_cvtpu8_ps (__m64 a)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Convert the lower packed unsigned 8-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*8 m := j*32 dst[m+31:m] := Convert_UnsignedInt8_To_FP32(a[i+7:i]) ENDFOR
movsd
double _mm_cvtsd_f64 (__m128d a)

Synopsis

double _mm_cvtsd_f64 (__m128d a)
#include "emmintrin.h"
Instruction: movsd m64, xmm
CPUID Flags: SSE2

Description

Copy the lower double-precision (64-bit) floating-point element of a to dst.

Operation

dst[63:0] := a[63:0]
vcvtsd2si
int _mm_cvtsd_i32 (__m128d a)

Synopsis

int _mm_cvtsd_i32 (__m128d a)
#include "immintrin.h"
Instruction: vcvtsd2si r32, xmm
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.

Operation

dst[31:0] := Convert_FP64_To_Int32(a[63:0])
vcvtsd2si
__int64 _mm_cvtsd_i64 (__m128d a)

Synopsis

__int64 _mm_cvtsd_i64 (__m128d a)
#include "immintrin.h"
Instruction: vcvtsd2si r64, xmm
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.

Operation

dst[63:0] := Convert_FP64_To_Int64(a[63:0])
cvtsd2si
int _mm_cvtsd_si32 (__m128d a)

Synopsis

int _mm_cvtsd_si32 (__m128d a)
#include "emmintrin.h"
Instruction: cvtsd2si r32, xmm
CPUID Flags: SSE2

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.

Operation

dst[31:0] := Convert_FP64_To_Int32(a[63:0])

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge51
Sandy Bridge51
cvtsd2si
__int64 _mm_cvtsd_si64 (__m128d a)

Synopsis

__int64 _mm_cvtsd_si64 (__m128d a)
#include "emmintrin.h"
Instruction: cvtsd2si r64, xmm
CPUID Flags: SSE2

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.

Operation

dst[63:0] := Convert_FP64_To_Int64(a[63:0])

Performance

ArchitectureLatencyThroughput
Haswell50.8
Ivy Bridge51
Sandy Bridge51
Westmere41
Nehalem41
cvtsd2si
__int64 _mm_cvtsd_si64x (__m128d a)

Synopsis

__int64 _mm_cvtsd_si64x (__m128d a)
#include "emmintrin.h"
Instruction: cvtsd2si r64, xmm
CPUID Flags: SSE2

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.

Operation

dst[63:0] := Convert_FP64_To_Int64(a[63:0])

Performance

ArchitectureLatencyThroughput
Haswell50.8
Ivy Bridge51
Sandy Bridge51
Westmere41
Nehalem41
cvtsd2ss
__m128 _mm_cvtsd_ss (__m128 a, __m128d b)

Synopsis

__m128 _mm_cvtsd_ss (__m128 a, __m128d b)
#include "emmintrin.h"
Instruction: cvtsd2ss xmm, xmm
CPUID Flags: SSE2

Description

Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[31:0] := Convert_FP64_To_FP32(b[63:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell41
Ivy Bridge41
Sandy Bridge41
Westmere41
Nehalem41
vcvtsd2ss
__m128 _mm_mask_cvtsd_ss (__m128 src, __mmask8 k, __m128 a, __m128d b)

Synopsis

__m128 _mm_mask_cvtsd_ss (__m128 src, __mmask8 k, __m128 a, __m128d b)
#include "immintrin.h"
Instruction: vcvtsd2ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[31:0] := Convert_FP64_To_FP32(b[63:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vcvtsd2ss
__m128 _mm_maskz_cvtsd_ss (__mmask8 k, __m128 a, __m128d b)

Synopsis

__m128 _mm_maskz_cvtsd_ss (__mmask8 k, __m128 a, __m128d b)
#include "immintrin.h"
Instruction: vcvtsd2ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := Convert_FP64_To_FP32(b[63:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vcvtsd2usi
unsigned int _mm_cvtsd_u32 (__m128d a)

Synopsis

unsigned int _mm_cvtsd_u32 (__m128d a)
#include "immintrin.h"
Instruction: vcvtsd2usi r32, xmm
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.

Operation

dst[31:0] := Convert_FP64_To_UnsignedInt32(a[63:0])
vcvtsd2usi
unsigned __int64 _mm_cvtsd_u64 (__m128d a)

Synopsis

unsigned __int64 _mm_cvtsd_u64 (__m128d a)
#include "immintrin.h"
Instruction: vcvtsd2usi r64, xmm
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.

Operation

dst[63:0] := Convert_FP64_To_UnsignedInt64(a[63:0])
vpmovswb
__m128i _mm_cvtsepi16_epi8 (__m128i a)

Synopsis

__m128i _mm_cvtsepi16_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 16*j l := 8*j dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:64] := 0
vpmovswb
__m128i _mm_mask_cvtsepi16_epi8 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtsepi16_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
vpmovswb
__m128i _mm_maskz_cvtsepi16_epi8 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtsepi16_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovswb
__m128i _mm256_cvtsepi16_epi8 (__m256i a)

Synopsis

__m128i _mm256_cvtsepi16_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := 16*j l := 8*j dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:128] := 0
vpmovswb
__m128i _mm256_mask_cvtsepi16_epi8 (__m128i src, __mmask16 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtsepi16_epi8 (__m128i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:128] := 0
vpmovswb
__m128i _mm256_maskz_cvtsepi16_epi8 (__mmask16 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtsepi16_epi8 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovswb
__m256i _mm512_cvtsepi16_epi8 (__m512i a)

Synopsis

__m256i _mm512_cvtsepi16_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 31 i := 16*j l := 8*j dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:256] := 0
vpmovswb
__m256i _mm512_mask_cvtsepi16_epi8 (__m256i src, __mmask32 k, __m512i a)

Synopsis

__m256i _mm512_mask_cvtsepi16_epi8 (__m256i src, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:256] := 0
vpmovswb
__m256i _mm512_maskz_cvtsepi16_epi8 (__mmask32 k, __m512i a)

Synopsis

__m256i _mm512_maskz_cvtsepi16_epi8 (__mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovswb
void _mm_mask_cvtsepi16_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtsepi16_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 16*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_Int16_To_Int8(a[i+15:i]) FI ENDFOR
vpmovswb
void _mm256_mask_cvtsepi16_storeu_epi8 (void* base_addr, __mmask16 k, __m256i a)

Synopsis

void _mm256_mask_cvtsepi16_storeu_epi8 (void* base_addr, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 15 i := 16*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_Int16_To_Int8(a[i+15:i]) FI ENDFOR
vpmovswb
void _mm512_mask_cvtsepi16_storeu_epi8 (void* base_addr, __mmask32 k, __m512i a)

Synopsis

void _mm512_mask_cvtsepi16_storeu_epi8 (void* base_addr, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512BW

Description

Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 31 i := 16*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_Int16_To_Int8(a[i+15:i]) FI ENDFOR
vpmovsdw
__m128i _mm_cvtsepi32_epi16 (__m128i a)

Synopsis

__m128i _mm_cvtsepi32_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j l := 16*j dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i]) ENDFOR dst[MAX:64] := 0
vpmovsdw
__m128i _mm_mask_cvtsepi32_epi16 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtsepi32_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:64] := 0
vpmovsdw
__m128i _mm_maskz_cvtsepi32_epi16 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtsepi32_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovsdw
__m128i _mm256_cvtsepi32_epi16 (__m256i a)

Synopsis

__m128i _mm256_cvtsepi32_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j l := 16*j dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i]) ENDFOR dst[MAX:128] := 0
vpmovsdw
__m128i _mm256_mask_cvtsepi32_epi16 (__m128i src, __mmask8 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtsepi32_epi16 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
vpmovsdw
__m128i _mm256_maskz_cvtsepi32_epi16 (__mmask8 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtsepi32_epi16 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovsdw
__m256i _mm512_cvtsepi32_epi16 (__m512i a)

Synopsis

__m256i _mm512_cvtsepi32_epi16 (__m512i a)
#include "immintrin.h"
Instruction: vpmovsdw ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j l := 16*j dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i]) ENDFOR dst[MAX:256] := 0
vpmovsdw
__m256i _mm512_mask_cvtsepi32_epi16 (__m256i src, __mmask16 k, __m512i a)

Synopsis

__m256i _mm512_mask_cvtsepi32_epi16 (__m256i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsdw ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:256] := 0
vpmovsdw
__m256i _mm512_maskz_cvtsepi32_epi16 (__mmask16 k, __m512i a)

Synopsis

__m256i _mm512_maskz_cvtsepi32_epi16 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsdw ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovsdb
__m128i _mm_cvtsepi32_epi8 (__m128i a)

Synopsis

__m128i _mm_cvtsepi32_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j l := 8*j dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i]) ENDFOR dst[MAX:32] := 0
vpmovsdb
__m128i _mm_mask_cvtsepi32_epi8 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtsepi32_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:32] := 0
vpmovsdb
__m128i _mm_maskz_cvtsepi32_epi8 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtsepi32_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:32] := 0
vpmovsdb
__m128i _mm256_cvtsepi32_epi8 (__m256i a)

Synopsis

__m128i _mm256_cvtsepi32_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j l := 8*j dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i]) ENDFOR dst[MAX:64] := 0
vpmovsdb
__m128i _mm256_mask_cvtsepi32_epi8 (__m128i src, __mmask8 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtsepi32_epi8 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
vpmovsdb
__m128i _mm256_maskz_cvtsepi32_epi8 (__mmask8 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtsepi32_epi8 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovsdb
__m128i _mm512_cvtsepi32_epi8 (__m512i a)

Synopsis

__m128i _mm512_cvtsepi32_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovsdb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j l := 8*j dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i]) ENDFOR dst[MAX:128] := 0
vpmovsdb
__m128i _mm512_mask_cvtsepi32_epi8 (__m128i src, __mmask16 k, __m512i a)

Synopsis

__m128i _mm512_mask_cvtsepi32_epi8 (__m128i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsdb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:128] := 0
vpmovsdb
__m128i _mm512_maskz_cvtsepi32_epi8 (__mmask16 k, __m512i a)

Synopsis

__m128i _mm512_maskz_cvtsepi32_epi8 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsdb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovsdw
void _mm_mask_cvtsepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtsepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Saturate_Int32_To_Int16(a[i+31:i]) FI ENDFOR
vpmovsdw
void _mm256_mask_cvtsepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_cvtsepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Saturate_Int32_To_Int16(a[i+31:i]) FI ENDFOR
vpmovsdw
void _mm512_mask_cvtsepi32_storeu_epi16 (void* base_addr, __mmask16 k, __m512i a)

Synopsis

void _mm512_mask_cvtsepi32_storeu_epi16 (void* base_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsdw m256 {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Saturate_Int32_To_Int16(a[i+31:i]) FI ENDFOR
vpmovsdb
void _mm_mask_cvtsepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtsepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_Int32_To_Int8(a[i+31:i]) FI ENDFOR
vpmovsdb
void _mm256_mask_cvtsepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_cvtsepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_Int32_To_Int8(a[i+31:i]) FI ENDFOR
vpmovsdb
void _mm512_mask_cvtsepi32_storeu_epi8 (void* base_addr, __mmask16 k, __m512i a)

Synopsis

void _mm512_mask_cvtsepi32_storeu_epi8 (void* base_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsdb m128 {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_Int32_To_Int8(a[i+31:i]) FI ENDFOR
vpmovsqw
__m128i _mm_cvtsepi64_epi16 (__m128i a)

Synopsis

__m128i _mm_cvtsepi64_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j l := 16*j dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i]) ENDFOR dst[MAX:32] := 0
vpmovsqw
__m128i _mm_mask_cvtsepi64_epi16 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtsepi64_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:32] := 0
vpmovsqw
__m128i _mm_maskz_cvtsepi64_epi16 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtsepi64_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:32] := 0
vpmovsqw
__m128i _mm256_cvtsepi64_epi16 (__m256i a)

Synopsis

__m128i _mm256_cvtsepi64_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j l := 16*j dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i]) ENDFOR dst[MAX:64] := 0
vpmovsqw
__m128i _mm256_mask_cvtsepi64_epi16 (__m128i src, __mmask8 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtsepi64_epi16 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:64] := 0
vpmovsqw
__m128i _mm256_maskz_cvtsepi64_epi16 (__mmask8 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtsepi64_epi16 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovsqw
__m128i _mm512_cvtsepi64_epi16 (__m512i a)

Synopsis

__m128i _mm512_cvtsepi64_epi16 (__m512i a)
#include "immintrin.h"
Instruction: vpmovsqw xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j l := 16*j dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vpmovsqw
__m128i _mm512_mask_cvtsepi64_epi16 (__m128i src, __mmask8 k, __m512i a)

Synopsis

__m128i _mm512_mask_cvtsepi64_epi16 (__m128i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqw xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
vpmovsqw
__m128i _mm512_maskz_cvtsepi64_epi16 (__mmask8 k, __m512i a)

Synopsis

__m128i _mm512_maskz_cvtsepi64_epi16 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqw xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovsqd
__m128i _mm_cvtsepi64_epi32 (__m128i a)

Synopsis

__m128i _mm_cvtsepi64_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j l := 32*j dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i]) ENDFOR dst[MAX:64] := 0
vpmovsqd
__m128i _mm_mask_cvtsepi64_epi32 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtsepi64_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:64] := 0
vpmovsqd
__m128i _mm_maskz_cvtsepi64_epi32 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtsepi64_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovsqd
__m128i _mm256_cvtsepi64_epi32 (__m256i a)

Synopsis

__m128i _mm256_cvtsepi64_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j l := 32*j dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vpmovsqd
__m128i _mm256_mask_cvtsepi64_epi32 (__m128i src, __mmask8 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtsepi64_epi32 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:128] := 0
vpmovsqd
__m128i _mm256_maskz_cvtsepi64_epi32 (__mmask8 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtsepi64_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovsqd
__m256i _mm512_cvtsepi64_epi32 (__m512i a)

Synopsis

__m256i _mm512_cvtsepi64_epi32 (__m512i a)
#include "immintrin.h"
Instruction: vpmovsqd ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j l := 32*j dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i]) ENDFOR dst[MAX:256] := 0
vpmovsqd
__m256i _mm512_mask_cvtsepi64_epi32 (__m256i src, __mmask8 k, __m512i a)

Synopsis

__m256i _mm512_mask_cvtsepi64_epi32 (__m256i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqd ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:256] := 0
vpmovsqd
__m256i _mm512_maskz_cvtsepi64_epi32 (__mmask8 k, __m512i a)

Synopsis

__m256i _mm512_maskz_cvtsepi64_epi32 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqd ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovsqb
__m128i _mm_cvtsepi64_epi8 (__m128i a)

Synopsis

__m128i _mm_cvtsepi64_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j l := 8*j dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i]) ENDFOR dst[MAX:16] := 0
vpmovsqb
__m128i _mm_mask_cvtsepi64_epi8 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtsepi64_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:16] := 0
vpmovsqb
__m128i _mm_maskz_cvtsepi64_epi8 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtsepi64_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:16] := 0
vpmovsqb
__m128i _mm256_cvtsepi64_epi8 (__m256i a)

Synopsis

__m128i _mm256_cvtsepi64_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j l := 8*j dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i]) ENDFOR dst[MAX:32] := 0
vpmovsqb
__m128i _mm256_mask_cvtsepi64_epi8 (__m128i src, __mmask8 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtsepi64_epi8 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:32] := 0
vpmovsqb
__m128i _mm256_maskz_cvtsepi64_epi8 (__mmask8 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtsepi64_epi8 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:32] := 0
vpmovsqb
__m128i _mm512_cvtsepi64_epi8 (__m512i a)

Synopsis

__m128i _mm512_cvtsepi64_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovsqb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j l := 8*j dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i]) ENDFOR dst[MAX:64] := 0
vpmovsqb
__m128i _mm512_mask_cvtsepi64_epi8 (__m128i src, __mmask8 k, __m512i a)

Synopsis

__m128i _mm512_mask_cvtsepi64_epi8 (__m128i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
vpmovsqb
__m128i _mm512_maskz_cvtsepi64_epi8 (__mmask8 k, __m512i a)

Synopsis

__m128i _mm512_maskz_cvtsepi64_epi8 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovsqw
void _mm_mask_cvtsepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtsepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Saturate_Int64_To_Int16(a[i+63:i]) FI ENDFOR
vpmovsqw
void _mm256_mask_cvtsepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_cvtsepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Saturate_Int64_To_Int16(a[i+63:i]) FI ENDFOR
vpmovsqw
void _mm512_mask_cvtsepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m512i a)

Synopsis

void _mm512_mask_cvtsepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqw m128 {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Saturate_Int64_To_Int16(a[i+63:i]) FI ENDFOR
vpmovsqd
void _mm_mask_cvtsepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtsepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] MEM[base_addr+l+31:base_addr+l] := Saturate_Int64_To_Int32(a[i+63:i]) FI ENDFOR
vpmovsqd
void _mm256_mask_cvtsepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_cvtsepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] MEM[base_addr+l+31:base_addr+l] := Saturate_Int64_To_Int32(a[i+63:i]) FI ENDFOR
vpmovsqd
void _mm512_mask_cvtsepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m512i a)

Synopsis

void _mm512_mask_cvtsepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqd m256 {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] MEM[base_addr+l+31:base_addr+l] := Saturate_Int64_To_Int32(a[i+63:i]) FI ENDFOR
vpmovsqb
void _mm_mask_cvtsepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtsepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_Int64_To_Int8(a[i+63:i]) FI ENDFOR
vpmovsqb
void _mm256_mask_cvtsepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_cvtsepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_Int64_To_Int8(a[i+63:i]) FI ENDFOR
vpmovsqb
void _mm512_mask_cvtsepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m512i a)

Synopsis

void _mm512_mask_cvtsepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqb m64 {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_Int64_To_Int8(a[i+63:i]) FI ENDFOR
...
float _cvtsh_ss (unsigned short a)

Synopsis

float _cvtsh_ss (unsigned short a)
#include "immintrin.h"
CPUID Flags: F16C

Description

Convert the half-precision (16-bit) floating-point value a to a single-precision (32-bit) floating-point value, and store the result in dst.

Operation

dst[31:0] := Convert_FP16_To_FP32(a[15:0])
movd
int _mm_cvtsi128_si32 (__m128i a)

Synopsis

int _mm_cvtsi128_si32 (__m128i a)
#include "emmintrin.h"
Instruction: movd r32, xmm
CPUID Flags: SSE2

Description

Copy the lower 32-bit integer in a to dst.

Operation

dst[31:0] := a[31:0]
movq
__int64 _mm_cvtsi128_si64 (__m128i a)

Synopsis

__int64 _mm_cvtsi128_si64 (__m128i a)
#include "emmintrin.h"
Instruction: movq r64, xmm
CPUID Flags: SSE2

Description

Copy the lower 64-bit integer in a to dst.

Operation

dst[63:0] := a[63:0]

Performance

ArchitectureLatencyThroughput
Haswell2-
Ivy Bridge2-
Sandy Bridge2-
Westmere2-
Nehalem2-
movq
__int64 _mm_cvtsi128_si64x (__m128i a)

Synopsis

__int64 _mm_cvtsi128_si64x (__m128i a)
#include "emmintrin.h"
Instruction: movq r64, xmm
CPUID Flags: SSE2

Description

Copy the lower 64-bit integer in a to dst.

Operation

dst[63:0] := a[63:0]

Performance

ArchitectureLatencyThroughput
Haswell2-
Ivy Bridge2-
Sandy Bridge2-
Westmere2-
Nehalem2-
cvtsi2sd
__m128d _mm_cvtsi32_sd (__m128d a, int b)

Synopsis

__m128d _mm_cvtsi32_sd (__m128d a, int b)
#include "emmintrin.h"
Instruction: cvtsi2sd xmm, r32
CPUID Flags: SSE2

Description

Convert the 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := Convert_Int32_To_FP64(b[31:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Ivy Bridge41
Sandy Bridge41
movd
__m128i _mm_cvtsi32_si128 (int a)

Synopsis

__m128i _mm_cvtsi32_si128 (int a)
#include "emmintrin.h"
Instruction: movd xmm, r32
CPUID Flags: SSE2

Description

Copy 32-bit integer a to the lower elements of dst, and zero the upper elements of dst.

Operation

dst[31:0] := a[31:0] dst[127:32] := 0
cvtsi2ss
__m128 _mm_cvtsi32_ss (__m128 a, int b)

Synopsis

__m128 _mm_cvtsi32_ss (__m128 a, int b)
#include "xmmintrin.h"
Instruction: cvtsi2ss xmm, r32
CPUID Flags: SSE

Description

Convert the 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := Convert_Int32_To_FP32(b[31:0]) dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge51
Sandy Bridge51
Westmere31
Nehalem31
cvtsi2sd
__m128d _mm_cvtsi64_sd (__m128d a, __int64 b)

Synopsis

__m128d _mm_cvtsi64_sd (__m128d a, __int64 b)
#include "emmintrin.h"
Instruction: cvtsi2sd xmm, r64
CPUID Flags: SSE2

Description

Convert the 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := Convert_Int64_To_FP64(b[63:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell40.8
Ivy Bridge41
Sandy Bridge41
Westmere31
Nehalem31
movq
__m128i _mm_cvtsi64_si128 (__int64 a)

Synopsis

__m128i _mm_cvtsi64_si128 (__int64 a)
#include "emmintrin.h"
Instruction: movq xmm, r64
CPUID Flags: SSE2

Description

Copy 64-bit integer a to the lower element of dst, and zero the upper element.

Operation

dst[63:0] := a[63:0] dst[127:64] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge1-
Westmere1-
Nehalem1-
cvtsi2ss
__m128 _mm_cvtsi64_ss (__m128 a, __int64 b)

Synopsis

__m128 _mm_cvtsi64_ss (__m128 a, __int64 b)
#include "xmmintrin.h"
Instruction: cvtsi2ss xmm, r64
CPUID Flags: SSE

Description

Convert the 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := Convert_Int64_To_FP32(b[63:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell5-
Ivy Bridge5-
Sandy Bridge5-
Westmere3-
Nehalem3-
cvtsi2sd
__m128d _mm_cvtsi64x_sd (__m128d a, __int64 b)

Synopsis

__m128d _mm_cvtsi64x_sd (__m128d a, __int64 b)
#include "emmintrin.h"
Instruction: cvtsi2sd xmm, r64
CPUID Flags: SSE2

Description

Convert the 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := Convert_Int64_To_FP64(b[63:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell40.8
Ivy Bridge41
Sandy Bridge41
Westmere31
Nehalem31
movq
__m128i _mm_cvtsi64x_si128 (__int64 a)

Synopsis

__m128i _mm_cvtsi64x_si128 (__int64 a)
#include "emmintrin.h"
Instruction: movq xmm, r64
CPUID Flags: SSE2

Description

Copy 64-bit integer a to the lower element of dst, and zero the upper element.

Operation

dst[63:0] := a[63:0] dst[127:64] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge1-
Westmere1-
Nehalem1-
movss
float _mm_cvtss_f32 (__m128 a)

Synopsis

float _mm_cvtss_f32 (__m128 a)
#include "xmmintrin.h"
Instruction: movss m32, xmm
CPUID Flags: SSE

Description

Copy the lower single-precision (32-bit) floating-point element of a to dst.

Operation

dst[31:0] := a[31:0]
vcvtss2si
int _mm_cvtss_i32 (__m128 a)

Synopsis

int _mm_cvtss_i32 (__m128 a)
#include "immintrin.h"
Instruction: vcvtss2si r32, xmm
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.

Operation

dst[31:0] := Convert_FP32_To_Int32(a[31:0])
vcvtss2si
__int64 _mm_cvtss_i64 (__m128 a)

Synopsis

__int64 _mm_cvtss_i64 (__m128 a)
#include "immintrin.h"
Instruction: vcvtss2si r64, xmm
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.

Operation

dst[63:0] := Convert_FP32_To_Int64(a[31:0])
cvtss2sd
__m128d _mm_cvtss_sd (__m128d a, __m128 b)

Synopsis

__m128d _mm_cvtss_sd (__m128d a, __m128 b)
#include "emmintrin.h"
Instruction: cvtss2sd xmm, xmm
CPUID Flags: SSE2

Description

Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := Convert_FP32_To_FP64(b[31:0]) dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell10.8
Ivy Bridge11
Sandy Bridge11
Westmere01
Nehalem01
vcvtss2sd
__m128d _mm_mask_cvtss_sd (__m128d src, __mmask8 k, __m128d a, __m128 b)

Synopsis

__m128d _mm_mask_cvtss_sd (__m128d src, __mmask8 k, __m128d a, __m128 b)
#include "immintrin.h"
Instruction: vcvtss2sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := Convert_FP32_To_FP64(b[31:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vcvtss2sd
__m128d _mm_maskz_cvtss_sd (__mmask8 k, __m128d a, __m128 b)

Synopsis

__m128d _mm_maskz_cvtss_sd (__mmask8 k, __m128d a, __m128 b)
#include "immintrin.h"
Instruction: vcvtss2sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := Convert_FP32_To_FP64(b[31:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
...
unsigned short _cvtss_sh (float a, int imm8)

Synopsis

unsigned short _cvtss_sh (float a, int imm8)
#include "immintrin.h"
CPUID Flags: F16C

Description

Convert the single-precision (32-bit) floating-point value a to a half-precision (16-bit) floating-point value, and store the result in dst.

Operation

dst[15:0] := Convert_FP32_To_FP16(a[31:0])
cvtss2si
int _mm_cvtss_si32 (__m128 a)

Synopsis

int _mm_cvtss_si32 (__m128 a)
#include "xmmintrin.h"
Instruction: cvtss2si r32, xmm
CPUID Flags: SSE

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.

Operation

dst[31:0] := Convert_FP32_To_Int32(a[31:0])
cvtss2si
__int64 _mm_cvtss_si64 (__m128 a)

Synopsis

__int64 _mm_cvtss_si64 (__m128 a)
#include "xmmintrin.h"
Instruction: cvtss2si r64, xmm
CPUID Flags: SSE

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.

Operation

dst[63:0] := Convert_FP32_To_Int64(a[31:0])

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge51
Sandy Bridge51
Westmere41
Nehalem41
vcvtss2usi
unsigned int _mm_cvtss_u32 (__m128 a)

Synopsis

unsigned int _mm_cvtss_u32 (__m128 a)
#include "immintrin.h"
Instruction: vcvtss2usi r32, xmm
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.

Operation

dst[31:0] := Convert_FP32_To_UnsignedInt32(a[31:0])
vcvtss2usi
unsigned __int64 _mm_cvtss_u64 (__m128 a)

Synopsis

unsigned __int64 _mm_cvtss_u64 (__m128 a)
#include "immintrin.h"
Instruction: vcvtss2usi r64, xmm
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.

Operation

dst[63:0] := Convert_FP32_To_UnsignedInt64(a[31:0])
cvttps2pi
__m64 _mm_cvtt_ps2pi (__m128 a)

Synopsis

__m64 _mm_cvtt_ps2pi (__m128 a)
#include "xmmintrin.h"
Instruction: cvttps2pi mm, xmm
CPUID Flags: SSE

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 1 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vcvttpd2dq
__m256i _mm512_cvtt_roundpd_epi32 (__m512d a, int sae)

Synopsis

__m256i _mm512_cvtt_roundpd_epi32 (__m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2dq ymm {k}, zmm {sae}
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[k+63:k]) ENDFOR dst[MAX:256] := 0
vcvttpd2dq
__m256i _mm512_mask_cvtt_roundpd_epi32 (__m256i src, __mmask8 k, __m512d a, int sae)

Synopsis

__m256i _mm512_mask_cvtt_roundpd_epi32 (__m256i src, __mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2dq ymm {k}, zmm {sae}
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvttpd2dq
__m256i _mm512_maskz_cvtt_roundpd_epi32 (__mmask8 k, __m512d a, int sae)

Synopsis

__m256i _mm512_maskz_cvtt_roundpd_epi32 (__mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2dq ymm {k}, zmm {sae}
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvttpd2qq
__m512i _mm512_cvtt_roundpd_epi64 (__m512d a, int sae)

Synopsis

__m512i _mm512_cvtt_roundpd_epi64 (__m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvttpd2qq
__m512i _mm512_mask_cvtt_roundpd_epi64 (__m512i src, __mmask8 k, __m512d a, int sae)

Synopsis

__m512i _mm512_mask_cvtt_roundpd_epi64 (__m512i src, __mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvttpd2qq
__m512i _mm512_maskz_cvtt_roundpd_epi64 (__mmask8 k, __m512d a, int sae)

Synopsis

__m512i _mm512_maskz_cvtt_roundpd_epi64 (__mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvttpd2udq
__m256i _mm512_cvtt_roundpd_epu32 (__m512d a, int sae)

Synopsis

__m256i _mm512_cvtt_roundpd_epu32 (__m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2udq ymm {k}, zmm {sae}
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[k+63:k]) ENDFOR dst[MAX:256] := 0
vcvttpd2udq
__m256i _mm512_mask_cvtt_roundpd_epu32 (__m256i src, __mmask8 k, __m512d a, int sae)

Synopsis

__m256i _mm512_mask_cvtt_roundpd_epu32 (__m256i src, __mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2udq ymm {k}, zmm {sae}
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvttpd2udq
__m256i _mm512_maskz_cvtt_roundpd_epu32 (__mmask8 k, __m512d a, int sae)

Synopsis

__m256i _mm512_maskz_cvtt_roundpd_epu32 (__mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2udq ymm {k}, zmm {sae}
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvttpd2uqq
__m512i _mm512_cvtt_roundpd_epu64 (__m512d a, int sae)

Synopsis

__m512i _mm512_cvtt_roundpd_epu64 (__m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvttpd2uqq
__m512i _mm512_mask_cvtt_roundpd_epu64 (__m512i src, __mmask8 k, __m512d a, int sae)

Synopsis

__m512i _mm512_mask_cvtt_roundpd_epu64 (__m512i src, __mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvttpd2uqq
__m512i _mm512_maskz_cvtt_roundpd_epu64 (__mmask8 k, __m512d a, int sae)

Synopsis

__m512i _mm512_maskz_cvtt_roundpd_epu64 (__mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvttps2dq
__m512i _mm512_cvtt_roundps_epi32 (__m512 a, int sae)

Synopsis

__m512i _mm512_cvtt_roundps_epi32 (__m512 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2dq zmm {k}, zmm {sae}
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvttps2dq
__m512i _mm512_mask_cvtt_roundps_epi32 (__m512i src, __mmask16 k, __m512 a, int sae)

Synopsis

__m512i _mm512_mask_cvtt_roundps_epi32 (__m512i src, __mmask16 k, __m512 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2dq zmm {k}, zmm {sae}
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vcvttps2dq
__m512i _mm512_maskz_cvtt_roundps_epi32 (__mmask16 k, __m512 a, int sae)

Synopsis

__m512i _mm512_maskz_cvtt_roundps_epi32 (__mmask16 k, __m512 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2dq zmm {k}, zmm {sae}
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvttps2qq
__m512i _mm512_cvtt_roundps_epi64 (__m256 a, int sae)

Synopsis

__m512i _mm512_cvtt_roundps_epi64 (__m256 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) ENDFOR dst[MAX:512] := 0
vcvttps2qq
__m512i _mm512_mask_cvtt_roundps_epi64 (__m512i src, __mmask8 k, __m256 a, int sae)

Synopsis

__m512i _mm512_mask_cvtt_roundps_epi64 (__m512i src, __mmask8 k, __m256 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvttps2qq
__m512i _mm512_maskz_cvtt_roundps_epi64 (__mmask8 k, __m256 a, int sae)

Synopsis

__m512i _mm512_maskz_cvtt_roundps_epi64 (__mmask8 k, __m256 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvttps2udq
__m512i _mm512_cvtt_roundps_epu32 (__m512 a, int sae)

Synopsis

__m512i _mm512_cvtt_roundps_epu32 (__m512 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2udq zmm {k}, zmm {sae}
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvttps2udq
__m512i _mm512_mask_cvtt_roundps_epu32 (__m512i src, __mmask16 k, __m512 a, int sae)

Synopsis

__m512i _mm512_mask_cvtt_roundps_epu32 (__m512i src, __mmask16 k, __m512 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2udq zmm {k}, zmm {sae}
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vcvttps2udq
__m512i _mm512_maskz_cvtt_roundps_epu32 (__mmask16 k, __m512 a, int sae)

Synopsis

__m512i _mm512_maskz_cvtt_roundps_epu32 (__mmask16 k, __m512 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2udq zmm {k}, zmm {sae}
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvttps2uqq
__m512i _mm512_cvtt_roundps_epu64 (__m256 a, int sae)

Synopsis

__m512i _mm512_cvtt_roundps_epu64 (__m256 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l]) ENDFOR dst[MAX:512] := 0
vcvttps2uqq
__m512i _mm512_mask_cvtt_roundps_epu64 (__m512i src, __mmask8 k, __m256 a, int sae)

Synopsis

__m512i _mm512_mask_cvtt_roundps_epu64 (__m512i src, __mmask8 k, __m256 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvttps2uqq
__m512i _mm512_maskz_cvtt_roundps_epu64 (__mmask8 k, __m256 a, int sae)

Synopsis

__m512i _mm512_maskz_cvtt_roundps_epu64 (__mmask8 k, __m256 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvttsd2si
int _mm_cvtt_roundsd_i32 (__m128d a, int rounding)

Synopsis

int _mm_cvtt_roundsd_i32 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvttsd2si r32, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
vcvttsd2si
__int64 _mm_cvtt_roundsd_i64 (__m128d a, int rounding)

Synopsis

__int64 _mm_cvtt_roundsd_i64 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvttsd2si r64, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
vcvttsd2si
int _mm_cvtt_roundsd_si32 (__m128d a, int rounding)

Synopsis

int _mm_cvtt_roundsd_si32 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvttsd2si r32, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
vcvttsd2si
__int64 _mm_cvtt_roundsd_si64 (__m128d a, int rounding)

Synopsis

__int64 _mm_cvtt_roundsd_si64 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvttsd2si r64, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
vcvttsd2usi
unsigned int _mm_cvtt_roundsd_u32 (__m128d a, int rounding)

Synopsis

unsigned int _mm_cvtt_roundsd_u32 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvttsd2usi r32, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_FP64_To_UnsignedInt32_Truncate(a[63:0])
vcvttsd2usi
unsigned __int64 _mm_cvtt_roundsd_u64 (__m128d a, int rounding)

Synopsis

unsigned __int64 _mm_cvtt_roundsd_u64 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvttsd2usi r64, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_FP64_To_UnsignedInt64_Truncate(a[63:0])
vcvttss2si
int _mm_cvtt_roundss_i32 (__m128 a, int rounding)

Synopsis

int _mm_cvtt_roundss_i32 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvttss2si r32, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
vcvttss2si
__int64 _mm_cvtt_roundss_i64 (__m128 a, int rounding)

Synopsis

__int64 _mm_cvtt_roundss_i64 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvttss2si r64, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
vcvttss2si
int _mm_cvtt_roundss_si32 (__m128 a, int rounding)

Synopsis

int _mm_cvtt_roundss_si32 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvttss2si r32, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
vcvttss2si
__int64 _mm_cvtt_roundss_si64 (__m128 a, int rounding)

Synopsis

__int64 _mm_cvtt_roundss_si64 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvttss2si r64, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
vcvttss2usi
unsigned int _mm_cvtt_roundss_u32 (__m128 a, int rounding)

Synopsis

unsigned int _mm_cvtt_roundss_u32 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvttss2usi r32, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := Convert_FP32_To_UnsignedInt32_Truncate(a[31:0])
vcvttss2usi
unsigned __int64 _mm_cvtt_roundss_u64 (__m128 a, int rounding)

Synopsis

unsigned __int64 _mm_cvtt_roundss_u64 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvttss2usi r64, xmm {er}
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := Convert_FP32_To_UnsignedInt64_Truncate(a[31:0])
cvttss2si
int _mm_cvtt_ss2si (__m128 a)

Synopsis

int _mm_cvtt_ss2si (__m128 a)
#include "xmmintrin.h"
Instruction: cvttss2si r32, xmm
CPUID Flags: SSE

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.

Operation

dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
cvttpd2dq
__m128i _mm_cvttpd_epi32 (__m128d a)

Synopsis

__m128i _mm_cvttpd_epi32 (__m128d a)
#include "emmintrin.h"
Instruction: cvttpd2dq xmm, xmm
CPUID Flags: SSE2

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 1 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell40.8
Ivy Bridge41
Sandy Bridge41
Westmere41
Nehalem41
vcvttpd2dq
__m128i _mm_mask_cvttpd_epi32 (__m128i src, __mmask8 k, __m128d a)

Synopsis

__m128i _mm_mask_cvttpd_epi32 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:64] := 0
vcvttpd2dq
__m128i _mm_maskz_cvttpd_epi32 (__mmask8 k, __m128d a)

Synopsis

__m128i _mm_maskz_cvttpd_epi32 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:64] := 0
vcvttpd2dq
__m128i _mm256_cvttpd_epi32 (__m256d a)

Synopsis

__m128i _mm256_cvttpd_epi32 (__m256d a)
#include "immintrin.h"
Instruction: vcvttpd2dq xmm, ymm
CPUID Flags: AVX

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell41
Ivy Bridge41
Sandy Bridge41
vcvttpd2dq
__m128i _mm256_mask_cvttpd_epi32 (__m128i src, __mmask8 k, __m256d a)

Synopsis

__m128i _mm256_mask_cvttpd_epi32 (__m128i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vcvttpd2dq
__m128i _mm256_maskz_cvttpd_epi32 (__mmask8 k, __m256d a)

Synopsis

__m128i _mm256_maskz_cvttpd_epi32 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvttpd2dq
__m256i _mm512_cvttpd_epi32 (__m512d a)

Synopsis

__m256i _mm512_cvttpd_epi32 (__m512d a)
#include "immintrin.h"
Instruction: vcvttpd2dq ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) ENDFOR dst[MAX:256] := 0
vcvttpd2dq
__m256i _mm512_mask_cvttpd_epi32 (__m256i src, __mmask8 k, __m512d a)

Synopsis

__m256i _mm512_mask_cvttpd_epi32 (__m256i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2dq ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvttpd2dq
__m256i _mm512_maskz_cvttpd_epi32 (__mmask8 k, __m512d a)

Synopsis

__m256i _mm512_maskz_cvttpd_epi32 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2dq ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvttpd2qq
__m128i _mm_cvttpd_epi64 (__m128d a)

Synopsis

__m128i _mm_cvttpd_epi64 (__m128d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vcvttpd2qq
__m128i _mm_mask_cvttpd_epi64 (__m128i src, __mmask8 k, __m128d a)

Synopsis

__m128i _mm_mask_cvttpd_epi64 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vcvttpd2qq
__m128i _mm_maskz_cvttpd_epi64 (__mmask8 k, __m128d a)

Synopsis

__m128i _mm_maskz_cvttpd_epi64 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvttpd2qq
__m256i _mm256_cvttpd_epi64 (__m256d a)

Synopsis

__m256i _mm256_cvttpd_epi64 (__m256d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ENDFOR dst[MAX:256] := 0
vcvttpd2qq
__m256i _mm256_mask_cvttpd_epi64 (__m256i src, __mmask8 k, __m256d a)

Synopsis

__m256i _mm256_mask_cvttpd_epi64 (__m256i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vcvttpd2qq
__m256i _mm256_maskz_cvttpd_epi64 (__mmask8 k, __m256d a)

Synopsis

__m256i _mm256_maskz_cvttpd_epi64 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvttpd2qq
__m512i _mm512_cvttpd_epi64 (__m512d a)

Synopsis

__m512i _mm512_cvttpd_epi64 (__m512d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvttpd2qq
__m512i _mm512_mask_cvttpd_epi64 (__m512i src, __mmask8 k, __m512d a)

Synopsis

__m512i _mm512_mask_cvttpd_epi64 (__m512i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvttpd2qq
__m512i _mm512_maskz_cvttpd_epi64 (__mmask8 k, __m512d a)

Synopsis

__m512i _mm512_maskz_cvttpd_epi64 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvttpd2udq
__m128i _mm_cvttpd_epu32 (__m128d a)

Synopsis

__m128i _mm_cvttpd_epu32 (__m128d a)
#include "immintrin.h"
Instruction: vcvttpd2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 1 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k]) ENDFOR dst[MAX:64] := 0
vcvttpd2udq
__m128i _mm_mask_cvttpd_epu32 (__m128i src, __mmask8 k, __m128d a)

Synopsis

__m128i _mm_mask_cvttpd_epu32 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:64] := 0
vcvttpd2udq
__m128i _mm_maskz_cvttpd_epu32 (__mmask8 k, __m128d a)

Synopsis

__m128i _mm_maskz_cvttpd_epu32 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:64] := 0
vcvttpd2udq
__m128i _mm256_cvttpd_epu32 (__m256d a)

Synopsis

__m128i _mm256_cvttpd_epu32 (__m256d a)
#include "immintrin.h"
Instruction: vcvttpd2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k]) ENDFOR dst[MAX:128] := 0
vcvttpd2udq
__m128i _mm256_mask_cvttpd_epu32 (__m128i src, __mmask8 k, __m256d a)

Synopsis

__m128i _mm256_mask_cvttpd_epu32 (__m128i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vcvttpd2udq
__m128i _mm256_maskz_cvttpd_epu32 (__mmask8 k, __m256d a)

Synopsis

__m128i _mm256_maskz_cvttpd_epu32 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvttpd2udq
__m256i _mm512_cvttpd_epu32 (__m512d a)

Synopsis

__m256i _mm512_cvttpd_epu32 (__m512d a)
#include "immintrin.h"
Instruction: vcvttpd2udq ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k]) ENDFOR dst[MAX:256] := 0
vcvttpd2udq
__m256i _mm512_mask_cvttpd_epu32 (__m256i src, __mmask8 k, __m512d a)

Synopsis

__m256i _mm512_mask_cvttpd_epu32 (__m256i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2udq ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvttpd2udq
__m256i _mm512_maskz_cvttpd_epu32 (__mmask8 k, __m512d a)

Synopsis

__m256i _mm512_maskz_cvttpd_epu32 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2udq ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvttpd2uqq
__m128i _mm_cvttpd_epu64 (__m128d a)

Synopsis

__m128i _mm_cvttpd_epu64 (__m128d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vcvttpd2uqq
__m128i _mm_mask_cvttpd_epu64 (__m128i src, __mmask8 k, __m128d a)

Synopsis

__m128i _mm_mask_cvttpd_epu64 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vcvttpd2uqq
__m128i _mm_maskz_cvttpd_epu64 (__mmask8 k, __m128d a)

Synopsis

__m128i _mm_maskz_cvttpd_epu64 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvttpd2uqq
__m256i _mm256_cvttpd_epu64 (__m256d a)

Synopsis

__m256i _mm256_cvttpd_epu64 (__m256d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i]) ENDFOR dst[MAX:256] := 0
vcvttpd2uqq
__m256i _mm256_mask_cvttpd_epu64 (__m256i src, __mmask8 k, __m256d a)

Synopsis

__m256i _mm256_mask_cvttpd_epu64 (__m256i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vcvttpd2uqq
__m256i _mm256_maskz_cvttpd_epu64 (__mmask8 k, __m256d a)

Synopsis

__m256i _mm256_maskz_cvttpd_epu64 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvttpd2uqq
__m512i _mm512_cvttpd_epu64 (__m512d a)

Synopsis

__m512i _mm512_cvttpd_epu64 (__m512d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vcvttpd2uqq
__m512i _mm512_mask_cvttpd_epu64 (__m512i src, __mmask8 k, __m512d a)

Synopsis

__m512i _mm512_mask_cvttpd_epu64 (__m512i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvttpd2uqq
__m512i _mm512_maskz_cvttpd_epu64 (__mmask8 k, __m512d a)

Synopsis

__m512i _mm512_maskz_cvttpd_epu64 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512DQ

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
cvttpd2pi
__m64 _mm_cvttpd_pi32 (__m128d a)

Synopsis

__m64 _mm_cvttpd_pi32 (__m128d a)
#include "emmintrin.h"
Instruction: cvttpd2pi mm, xmm
CPUID Flags: SSE2

Description

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 1 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell41
Ivy Bridge41
Sandy Bridge41
Westmere41
Nehalem41
cvttps2dq
__m128i _mm_cvttps_epi32 (__m128 a)

Synopsis

__m128i _mm_cvttps_epi32 (__m128 a)
#include "emmintrin.h"
Instruction: cvttps2dq xmm, xmm
CPUID Flags: SSE2

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell30.8
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vcvttps2dq
__m128i _mm_mask_cvttps_epi32 (__m128i src, __mmask8 k, __m128 a)

Synopsis

__m128i _mm_mask_cvttps_epi32 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vcvttps2dq
__m128i _mm_maskz_cvttps_epi32 (__mmask8 k, __m128 a)

Synopsis

__m128i _mm_maskz_cvttps_epi32 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvttps2dq
__m256i _mm256_cvttps_epi32 (__m256 a)

Synopsis

__m256i _mm256_cvttps_epi32 (__m256 a)
#include "immintrin.h"
Instruction: vcvttps2dq ymm, ymm
CPUID Flags: AVX

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
vcvttps2dq
__m256i _mm256_mask_cvttps_epi32 (__m256i src, __mmask8 k, __m256 a)

Synopsis

__m256i _mm256_mask_cvttps_epi32 (__m256i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvttps2dq
__m256i _mm256_maskz_cvttps_epi32 (__mmask8 k, __m256 a)

Synopsis

__m256i _mm256_maskz_cvttps_epi32 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2dq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvttps2dq
__m512i _mm512_cvttps_epi32 (__m512 a)

Synopsis

__m512i _mm512_cvttps_epi32 (__m512 a)
#include "immintrin.h"
Instruction: vcvttps2dq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvttps2dq
__m512i _mm512_mask_cvttps_epi32 (__m512i src, __mmask16 k, __m512 a)

Synopsis

__m512i _mm512_mask_cvttps_epi32 (__m512i src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvttps2dq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vcvttps2dq
__m512i _mm512_maskz_cvttps_epi32 (__mmask16 k, __m512 a)

Synopsis

__m512i _mm512_maskz_cvttps_epi32 (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvttps2dq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvttps2qq
__m128i _mm_cvttps_epi64 (__m128 a)

Synopsis

__m128i _mm_cvttps_epi64 (__m128 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) ENDFOR dst[MAX:128] := 0
vcvttps2qq
__m128i _mm_mask_cvttps_epi64 (__m128i src, __mmask8 k, __m128 a)

Synopsis

__m128i _mm_mask_cvttps_epi64 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vcvttps2qq
__m128i _mm_maskz_cvttps_epi64 (__mmask8 k, __m128 a)

Synopsis

__m128i _mm_maskz_cvttps_epi64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvttps2qq
__m256i _mm256_cvttps_epi64 (__m128 a)

Synopsis

__m256i _mm256_cvttps_epi64 (__m128 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) ENDFOR dst[MAX:256] := 0
vcvttps2qq
__m256i _mm256_mask_cvttps_epi64 (__m256i src, __mmask8 k, __m128 a)

Synopsis

__m256i _mm256_mask_cvttps_epi64 (__m256i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vcvttps2qq
__m256i _mm256_maskz_cvttps_epi64 (__mmask8 k, __m128 a)

Synopsis

__m256i _mm256_maskz_cvttps_epi64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvttps2qq
__m512i _mm512_cvttps_epi64 (__m256 a)

Synopsis

__m512i _mm512_cvttps_epi64 (__m256 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) ENDFOR dst[MAX:512] := 0
vcvttps2qq
__m512i _mm512_mask_cvttps_epi64 (__m512i src, __mmask8 k, __m256 a)

Synopsis

__m512i _mm512_mask_cvttps_epi64 (__m512i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvttps2qq
__m512i _mm512_maskz_cvttps_epi64 (__mmask8 k, __m256 a)

Synopsis

__m512i _mm512_maskz_cvttps_epi64 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvttps2udq
__m128i _mm_cvttps_epu32 (__m128 a)

Synopsis

__m128i _mm_cvttps_epu32 (__m128 a)
#include "immintrin.h"
Instruction: vcvttps2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ENDFOR dst[MAX:128] := 0
vcvttps2udq
__m128i _mm_mask_cvttps_epu32 (__m128i src, __mmask8 k, __m128 a)

Synopsis

__m128i _mm_mask_cvttps_epu32 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vcvttps2udq
__m128i _mm_maskz_cvttps_epu32 (__mmask8 k, __m128 a)

Synopsis

__m128i _mm_maskz_cvttps_epu32 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvttps2udq
__m256i _mm256_cvttps_epu32 (__m256 a)

Synopsis

__m256i _mm256_cvttps_epu32 (__m256 a)
#include "immintrin.h"
Instruction: vcvttps2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ENDFOR dst[MAX:256] := 0
vcvttps2udq
__m256i _mm256_mask_cvttps_epu32 (__m256i src, __mmask8 k, __m256 a)

Synopsis

__m256i _mm256_mask_cvttps_epu32 (__m256i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vcvttps2udq
__m256i _mm256_maskz_cvttps_epu32 (__mmask8 k, __m256 a)

Synopsis

__m256i _mm256_maskz_cvttps_epu32 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2udq
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvttps2udq
__m512i _mm512_cvttps_epu32 (__m512 a)

Synopsis

__m512i _mm512_cvttps_epu32 (__m512 a)
#include "immintrin.h"
Instruction: vcvttps2udq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vcvttps2udq
__m512i _mm512_mask_cvttps_epu32 (__m512i src, __mmask16 k, __m512 a)

Synopsis

__m512i _mm512_mask_cvttps_epu32 (__m512i src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvttps2udq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vcvttps2udq
__m512i _mm512_maskz_cvttps_epu32 (__mmask16 k, __m512 a)

Synopsis

__m512i _mm512_maskz_cvttps_epu32 (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvttps2udq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vcvttps2uqq
__m128i _mm_cvttps_epu64 (__m128 a)

Synopsis

__m128i _mm_cvttps_epu64 (__m128 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l]) ENDFOR dst[MAX:128] := 0
vcvttps2uqq
__m128i _mm_mask_cvttps_epu64 (__m128i src, __mmask8 k, __m128 a)

Synopsis

__m128i _mm_mask_cvttps_epu64 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vcvttps2uqq
__m128i _mm_maskz_cvttps_epu64 (__mmask8 k, __m128 a)

Synopsis

__m128i _mm_maskz_cvttps_epu64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vcvttps2uqq
__m256i _mm256_cvttps_epu64 (__m128 a)

Synopsis

__m256i _mm256_cvttps_epu64 (__m128 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l]) ENDFOR dst[MAX:256] := 0
vcvttps2uqq
__m256i _mm256_mask_cvttps_epu64 (__m256i src, __mmask8 k, __m128 a)

Synopsis

__m256i _mm256_mask_cvttps_epu64 (__m256i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vcvttps2uqq
__m256i _mm256_maskz_cvttps_epu64 (__mmask8 k, __m128 a)

Synopsis

__m256i _mm256_maskz_cvttps_epu64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512VL + AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vcvttps2uqq
__m512i _mm512_cvttps_epu64 (__m256 a)

Synopsis

__m512i _mm512_cvttps_epu64 (__m256 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l]) ENDFOR dst[MAX:512] := 0
vcvttps2uqq
__m512i _mm512_mask_cvttps_epu64 (__m512i src, __mmask8 k, __m256 a)

Synopsis

__m512i _mm512_mask_cvttps_epu64 (__m512i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vcvttps2uqq
__m512i _mm512_maskz_cvttps_epu64 (__mmask8 k, __m256 a)

Synopsis

__m512i _mm512_maskz_cvttps_epu64 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512DQ

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
cvttps2pi
__m64 _mm_cvttps_pi32 (__m128 a)

Synopsis

__m64 _mm_cvttps_pi32 (__m128 a)
#include "xmmintrin.h"
Instruction: cvttps2pi mm, xmm
CPUID Flags: SSE

Description

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.

Operation

FOR j := 0 to 1 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vcvttsd2si
int _mm_cvttsd_i32 (__m128d a)

Synopsis

int _mm_cvttsd_i32 (__m128d a)
#include "immintrin.h"
Instruction: vcvttsd2si r32, xmm
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.

Operation

dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
vcvttsd2si
__int64 _mm_cvttsd_i64 (__m128d a)

Synopsis

__int64 _mm_cvttsd_i64 (__m128d a)
#include "immintrin.h"
Instruction: vcvttsd2si r64, xmm
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.

Operation

dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
cvttsd2si
int _mm_cvttsd_si32 (__m128d a)

Synopsis

int _mm_cvttsd_si32 (__m128d a)
#include "emmintrin.h"
Instruction: cvttsd2si r32, xmm
CPUID Flags: SSE2

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.

Operation

dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
cvttsd2si
__int64 _mm_cvttsd_si64 (__m128d a)

Synopsis

__int64 _mm_cvttsd_si64 (__m128d a)
#include "emmintrin.h"
Instruction: cvttsd2si r64, xmm
CPUID Flags: SSE2

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.

Operation

dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])

Performance

ArchitectureLatencyThroughput
Haswell50.8
Ivy Bridge51
Sandy Bridge51
Westmere41
Nehalem41
cvttsd2si
__int64 _mm_cvttsd_si64x (__m128d a)

Synopsis

__int64 _mm_cvttsd_si64x (__m128d a)
#include "emmintrin.h"
Instruction: cvttsd2si r64, xmm
CPUID Flags: SSE2

Description

Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.

Operation

dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])

Performance

ArchitectureLatencyThroughput
Haswell50.8
Ivy Bridge51
Sandy Bridge51
Westmere41
Nehalem41
vcvttsd2usi
unsigned int _mm_cvttsd_u32 (__m128d a)

Synopsis

unsigned int _mm_cvttsd_u32 (__m128d a)
#include "immintrin.h"
Instruction: vcvttsd2usi r32, xmm
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.

Operation

dst[31:0] := Convert_FP64_To_UnsignedInt32_Truncate(a[63:0])
vcvttsd2usi
unsigned __int64 _mm_cvttsd_u64 (__m128d a)

Synopsis

unsigned __int64 _mm_cvttsd_u64 (__m128d a)
#include "immintrin.h"
Instruction: vcvttsd2usi r64, xmm
CPUID Flags: AVX512F

Description

Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.

Operation

dst[63:0] := Convert_FP64_To_UnsignedInt64_Truncate(a[63:0])
vcvttss2si
int _mm_cvttss_i32 (__m128 a)

Synopsis

int _mm_cvttss_i32 (__m128 a)
#include "immintrin.h"
Instruction: vcvttss2si r32, xmm
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.

Operation

dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
vcvttss2si
__int64 _mm_cvttss_i64 (__m128 a)

Synopsis

__int64 _mm_cvttss_i64 (__m128 a)
#include "immintrin.h"
Instruction: vcvttss2si r64, xmm
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.

Operation

dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
cvttss2si
int _mm_cvttss_si32 (__m128 a)

Synopsis

int _mm_cvttss_si32 (__m128 a)
#include "xmmintrin.h"
Instruction: cvttss2si r32, xmm
CPUID Flags: SSE

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.

Operation

dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
cvttss2si
__int64 _mm_cvttss_si64 (__m128 a)

Synopsis

__int64 _mm_cvttss_si64 (__m128 a)
#include "xmmintrin.h"
Instruction: cvttss2si r64, xmm
CPUID Flags: SSE

Description

Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.

Operation

dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge51
Sandy Bridge51
Westmere41
Nehalem41
vcvttss2usi
unsigned int _mm_cvttss_u32 (__m128 a)

Synopsis

unsigned int _mm_cvttss_u32 (__m128 a)
#include "immintrin.h"
Instruction: vcvttss2usi r32, xmm
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.

Operation

dst[31:0] := Convert_FP32_To_UnsignedInt32_Truncate(a[31:0])
vcvttss2usi
unsigned __int64 _mm_cvttss_u64 (__m128 a)

Synopsis

unsigned __int64 _mm_cvttss_u64 (__m128 a)
#include "immintrin.h"
Instruction: vcvttss2usi r64, xmm
CPUID Flags: AVX512F

Description

Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.

Operation

dst[63:0] := Convert_FP32_To_UnsignedInt64_Truncate(a[31:0])
vcvtusi2sd
__m128d _mm_cvtu32_sd (__m128d a, unsigned int b)

Synopsis

__m128d _mm_cvtu32_sd (__m128d a, unsigned int b)
#include "immintrin.h"
Instruction: vcvtusi2sd xmm, xmm, r32
CPUID Flags: AVX512F

Description

Convert the unsigned 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := Convert_UnsignedInt32_To_FP64(b[31:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0
vcvtusi2ss
__m128 _mm_cvtu32_ss (__m128 a, unsigned int b)

Synopsis

__m128 _mm_cvtu32_ss (__m128 a, unsigned int b)
#include "immintrin.h"
Instruction: vcvtusi2ss xmm, xmm, r32 {er}
CPUID Flags: AVX512F

Description

Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := Convert_UnsignedInt32_To_FP32(b[31:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
vcvtusi2sd
__m128d _mm_cvtu64_sd (__m128d a, unsigned __int64 b)

Synopsis

__m128d _mm_cvtu64_sd (__m128d a, unsigned __int64 b)
#include "immintrin.h"
Instruction: vcvtusi2sd xmm, xmm, r64
CPUID Flags: AVX512F

Description

Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := Convert_UnsignedInt64_To_FP64(b[63:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0
vcvtusi2ss
__m128 _mm_cvtu64_ss (__m128 a, unsigned __int64 b)

Synopsis

__m128 _mm_cvtu64_ss (__m128 a, unsigned __int64 b)
#include "immintrin.h"
Instruction: vcvtusi2ss xmm, xmm, r64
CPUID Flags: AVX512F

Description

Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := Convert_UnsignedInt64_To_FP32(b[63:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
vpmovuswb
__m128i _mm_cvtusepi16_epi8 (__m128i a)

Synopsis

__m128i _mm_cvtusepi16_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 16*j l := 8*j dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:64] := 0
vpmovuswb
__m128i _mm_mask_cvtusepi16_epi8 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtusepi16_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
vpmovuswb
__m128i _mm_maskz_cvtusepi16_epi8 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtusepi16_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovuswb
__m128i _mm256_cvtusepi16_epi8 (__m256i a)

Synopsis

__m128i _mm256_cvtusepi16_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := 16*j l := 8*j dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:128] := 0
vpmovuswb
__m128i _mm256_mask_cvtusepi16_epi8 (__m128i src, __mmask16 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtusepi16_epi8 (__m128i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:128] := 0
vpmovuswb
__m128i _mm256_maskz_cvtusepi16_epi8 (__mmask16 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtusepi16_epi8 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovuswb
__m256i _mm512_cvtusepi16_epi8 (__m512i a)

Synopsis

__m256i _mm512_cvtusepi16_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512BW

Description

Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 31 i := 16*j l := 8*j dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:256] := 0
vpmovuswb
__m256i _mm512_mask_cvtusepi16_epi8 (__m256i src, __mmask32 k, __m512i a)

Synopsis

__m256i _mm512_mask_cvtusepi16_epi8 (__m256i src, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512BW

Description

Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:256] := 0
vpmovuswb
__m256i _mm512_maskz_cvtusepi16_epi8 (__mmask32 k, __m512i a)

Synopsis

__m256i _mm512_maskz_cvtusepi16_epi8 (__mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512BW

Description

Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovuswb
void _mm_mask_cvtusepi16_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtusepi16_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 16*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) FI ENDFOR
vpmovuswb
void _mm256_mask_cvtusepi16_storeu_epi8 (void* base_addr, __mmask16 k, __m256i a)

Synopsis

void _mm256_mask_cvtusepi16_storeu_epi8 (void* base_addr, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 15 i := 16*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) FI ENDFOR
vpmovuswb
void _mm512_mask_cvtusepi16_storeu_epi8 (void* base_addr, __mmask32 k, __m512i a)

Synopsis

void _mm512_mask_cvtusepi16_storeu_epi8 (void* base_addr, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512BW

Description

Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 31 i := 16*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) FI ENDFOR
vpmovusdw
__m128i _mm_cvtusepi32_epi16 (__m128i a)

Synopsis

__m128i _mm_cvtusepi32_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 16*j dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ENDFOR dst[MAX:64] := 0
vpmovusdw
__m128i _mm_mask_cvtusepi32_epi16 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtusepi32_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:64] := 0
vpmovusdw
__m128i _mm_maskz_cvtusepi32_epi16 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtusepi32_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovusdw
__m128i _mm256_cvtusepi32_epi16 (__m256i a)

Synopsis

__m128i _mm256_cvtusepi32_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 16*j dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ENDFOR dst[MAX:128] := 0
vpmovusdw
__m128i _mm256_mask_cvtusepi32_epi16 (__m128i src, __mmask8 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtusepi32_epi16 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
vpmovusdw
__m128i _mm256_maskz_cvtusepi32_epi16 (__mmask8 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtusepi32_epi16 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovusdw
__m256i _mm512_cvtusepi32_epi16 (__m512i a)

Synopsis

__m256i _mm512_cvtusepi32_epi16 (__m512i a)
#include "immintrin.h"
Instruction: vpmovusdw ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j k := 16*j dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ENDFOR dst[MAX:256] := 0
vpmovusdw
__m256i _mm512_mask_cvtusepi32_epi16 (__m256i src, __mmask16 k, __m512i a)

Synopsis

__m256i _mm512_mask_cvtusepi32_epi16 (__m256i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusdw ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:256] := 0
vpmovusdw
__m256i _mm512_maskz_cvtusepi32_epi16 (__mmask16 k, __m512i a)

Synopsis

__m256i _mm512_maskz_cvtusepi32_epi16 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusdw ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovusdb
__m128i _mm_cvtusepi32_epi8 (__m128i a)

Synopsis

__m128i _mm_cvtusepi32_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j k := 8*j dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ENDFOR dst[MAX:32] := 0
vpmovusdb
__m128i _mm_mask_cvtusepi32_epi8 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtusepi32_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:32] := 0
vpmovusdb
__m128i _mm_maskz_cvtusepi32_epi8 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtusepi32_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:32] := 0
vpmovusdb
__m128i _mm256_cvtusepi32_epi8 (__m256i a)

Synopsis

__m128i _mm256_cvtusepi32_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j k := 8*j dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ENDFOR dst[MAX:64] := 0
vpmovusdb
__m128i _mm256_mask_cvtusepi32_epi8 (__m128i src, __mmask8 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtusepi32_epi8 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
vpmovusdb
__m128i _mm256_maskz_cvtusepi32_epi8 (__mmask8 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtusepi32_epi8 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovusdb
__m128i _mm512_cvtusepi32_epi8 (__m512i a)

Synopsis

__m128i _mm512_cvtusepi32_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovusdb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j k := 8*j dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ENDFOR dst[MAX:128] := 0
vpmovusdb
__m128i _mm512_mask_cvtusepi32_epi8 (__m128i src, __mmask16 k, __m512i a)

Synopsis

__m128i _mm512_mask_cvtusepi32_epi8 (__m128i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusdb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:128] := 0
vpmovusdb
__m128i _mm512_maskz_cvtusepi32_epi8 (__mmask16 k, __m512i a)

Synopsis

__m128i _mm512_maskz_cvtusepi32_epi8 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusdb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovusdw
void _mm_mask_cvtusepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtusepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) FI ENDFOR
vpmovusdw
void _mm256_mask_cvtusepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_cvtusepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) FI ENDFOR
vpmovusdw
void _mm512_mask_cvtusepi32_storeu_epi16 (void* base_addr, __mmask16 k, __m512i a)

Synopsis

void _mm512_mask_cvtusepi32_storeu_epi16 (void* base_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusdw m256 {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) FI ENDFOR
vpmovusdb
void _mm_mask_cvtusepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtusepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) FI ENDFOR
vpmovusdb
void _mm256_mask_cvtusepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_cvtusepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) FI ENDFOR
vpmovusdb
void _mm512_mask_cvtusepi32_storeu_epi8 (void* base_addr, __mmask16 k, __m512i a)

Synopsis

void _mm512_mask_cvtusepi32_storeu_epi8 (void* base_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusdb m128 {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) FI ENDFOR
vpmovusqw
__m128i _mm_cvtusepi64_epi16 (__m128i a)

Synopsis

__m128i _mm_cvtusepi64_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j k := 16*j dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ENDFOR dst[MAX:32] := 0
vpmovusqw
__m128i _mm_mask_cvtusepi64_epi16 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtusepi64_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:32] := 0
vpmovusqw
__m128i _mm_maskz_cvtusepi64_epi16 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtusepi64_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:32] := 0
vpmovusqw
__m128i _mm256_cvtusepi64_epi16 (__m256i a)

Synopsis

__m128i _mm256_cvtusepi64_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 16*j dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ENDFOR dst[MAX:64] := 0
vpmovusqw
__m128i _mm256_mask_cvtusepi64_epi16 (__m128i src, __mmask8 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtusepi64_epi16 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:64] := 0
vpmovusqw
__m128i _mm256_maskz_cvtusepi64_epi16 (__mmask8 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtusepi64_epi16 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovusqw
__m128i _mm512_cvtusepi64_epi16 (__m512i a)

Synopsis

__m128i _mm512_cvtusepi64_epi16 (__m512i a)
#include "immintrin.h"
Instruction: vpmovusqw xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j k := 16*j dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vpmovusqw
__m128i _mm512_mask_cvtusepi64_epi16 (__m128i src, __mmask8 k, __m512i a)

Synopsis

__m128i _mm512_mask_cvtusepi64_epi16 (__m128i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqw xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
vpmovusqw
__m128i _mm512_maskz_cvtusepi64_epi16 (__mmask8 k, __m512i a)

Synopsis

__m128i _mm512_maskz_cvtusepi64_epi16 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqw xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovusqd
__m128i _mm_cvtusepi64_epi32 (__m128i a)

Synopsis

__m128i _mm_cvtusepi64_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j k := 32*j dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ENDFOR dst[MAX:64] := 0
vpmovusqd
__m128i _mm_mask_cvtusepi64_epi32 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtusepi64_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:64] := 0
vpmovusqd
__m128i _mm_maskz_cvtusepi64_epi32 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtusepi64_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovusqd
__m128i _mm256_cvtusepi64_epi32 (__m256i a)

Synopsis

__m128i _mm256_cvtusepi64_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 32*j dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vpmovusqd
__m128i _mm256_mask_cvtusepi64_epi32 (__m128i src, __mmask8 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtusepi64_epi32 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:128] := 0
vpmovusqd
__m128i _mm256_maskz_cvtusepi64_epi32 (__mmask8 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtusepi64_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovusqd
__m256i _mm512_cvtusepi64_epi32 (__m512i a)

Synopsis

__m256i _mm512_cvtusepi64_epi32 (__m512i a)
#include "immintrin.h"
Instruction: vpmovusqd ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j k := 32*j dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ENDFOR dst[MAX:256] := 0
vpmovusqd
__m256i _mm512_mask_cvtusepi64_epi32 (__m256i src, __mmask8 k, __m512i a)

Synopsis

__m256i _mm512_mask_cvtusepi64_epi32 (__m256i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqd ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:256] := 0
vpmovusqd
__m256i _mm512_maskz_cvtusepi64_epi32 (__mmask8 k, __m512i a)

Synopsis

__m256i _mm512_maskz_cvtusepi64_epi32 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqd ymm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovusqb
__m128i _mm_cvtusepi64_epi8 (__m128i a)

Synopsis

__m128i _mm_cvtusepi64_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j k := 8*j dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ENDFOR dst[MAX:16] := 0
vpmovusqb
__m128i _mm_mask_cvtusepi64_epi8 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_cvtusepi64_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:16] := 0
vpmovusqb
__m128i _mm_maskz_cvtusepi64_epi8 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_cvtusepi64_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:16] := 0
vpmovusqb
__m128i _mm256_cvtusepi64_epi8 (__m256i a)

Synopsis

__m128i _mm256_cvtusepi64_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j k := 8*j dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ENDFOR dst[MAX:32] := 0
vpmovusqb
__m128i _mm256_mask_cvtusepi64_epi8 (__m128i src, __mmask8 k, __m256i a)

Synopsis

__m128i _mm256_mask_cvtusepi64_epi8 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:32] := 0
vpmovusqb
__m128i _mm256_maskz_cvtusepi64_epi8 (__mmask8 k, __m256i a)

Synopsis

__m128i _mm256_maskz_cvtusepi64_epi8 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:32] := 0
vpmovusqb
__m128i _mm512_cvtusepi64_epi8 (__m512i a)

Synopsis

__m128i _mm512_cvtusepi64_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovusqb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j k := 8*j dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ENDFOR dst[MAX:64] := 0
vpmovusqb
__m128i _mm512_mask_cvtusepi64_epi8 (__m128i src, __mmask8 k, __m512i a)

Synopsis

__m128i _mm512_mask_cvtusepi64_epi8 (__m128i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
vpmovusqb
__m128i _mm512_maskz_cvtusepi64_epi8 (__mmask8 k, __m512i a)

Synopsis

__m128i _mm512_maskz_cvtusepi64_epi8 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqb xmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
vpmovusqw
void _mm_mask_cvtusepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtusepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) FI ENDFOR
vpmovusqw
void _mm256_mask_cvtusepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_cvtusepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) FI ENDFOR
vpmovusqw
void _mm512_mask_cvtusepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m512i a)

Synopsis

void _mm512_mask_cvtusepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqw m128 {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) FI ENDFOR
vpmovusqd
void _mm_mask_cvtusepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtusepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] MEM[base_addr+l+31:base_addr+l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) FI ENDFOR
vpmovusqd
void _mm256_mask_cvtusepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_cvtusepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] MEM[base_addr+l+31:base_addr+l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) FI ENDFOR
vpmovusqd
void _mm512_mask_cvtusepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m512i a)

Synopsis

void _mm512_mask_cvtusepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqd m256 {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] MEM[base_addr+l+31:base_addr+l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) FI ENDFOR
vpmovusqb
void _mm_mask_cvtusepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_cvtusepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) FI ENDFOR
vpmovusqb
void _mm256_mask_cvtusepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_cvtusepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) FI ENDFOR
vpmovusqb
void _mm512_mask_cvtusepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m512i a)

Synopsis

void _mm512_mask_cvtusepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqb m64 {k}, zmm
CPUID Flags: AVX512F

Description

Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.

Operation

FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) FI ENDFOR
vdbpsadbw
__m128i _mm_dbsad_epu8 (__m128i a, __m128i b, int imm8)

Synopsis

__m128i _mm_dbsad_epu8 (__m128i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

Operation

tmp[31:0] := select(b[127:0], imm8[1:0]) tmp[63:32] := select(b[127:0], imm8[3:2]) tmp[95:64] := select(b[127:0], imm8[5:4]) tmp[127:96] := select(b[127:0], imm8[7:6]) FOR j := 0 to 1 i := j*64 dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) ENDFOR dst[MAX:128] := 0
vdbpsadbw
__m128i _mm_mask_dbsad_epu8 (__m128i src, __mmask8 k, __m128i a, __m128i b, int imm8)

Synopsis

__m128i _mm_mask_dbsad_epu8 (__m128i src, __mmask8 k, __m128i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

Operation

tmp[31:0] := select(b[127:0], imm8[1:0]) tmp[63:32] := select(b[127:0], imm8[3:2]) tmp[95:64] := select(b[127:0], imm8[5:4]) tmp[127:96] := select(b[127:0], imm8[7:6]) FOR j := 0 to 1 i := j*64 tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) ENDFOR FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vdbpsadbw
__m128i _mm_maskz_dbsad_epu8 (__mmask8 k, __m128i a, __m128i b, int imm8)

Synopsis

__m128i _mm_maskz_dbsad_epu8 (__mmask8 k, __m128i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

Operation

tmp[31:0] := select(b[127:0], imm8[1:0]) tmp[63:32] := select(b[127:0], imm8[3:2]) tmp[95:64] := select(b[127:0], imm8[5:4]) tmp[127:96] := select(b[127:0], imm8[7:6]) FOR j := 0 to 1 i := j*64 tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) ENDFOR FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vdbpsadbw
__m256i _mm256_dbsad_epu8 (__m256i a, __m256i b, int imm8)

Synopsis

__m256i _mm256_dbsad_epu8 (__m256i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

Operation

FOR j := 0 to 1 i := j*128 tmp[i+31:i] := select(b[i+127:i], imm8[1:0]) tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2]) tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4]) tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6]) ENDFOR FOR j := 0 to 3 i := j*64 dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) ENDFOR dst[MAX:256] := 0
vdbpsadbw
__m256i _mm256_mask_dbsad_epu8 (__m256i src, __mmask16 k, __m256i a, __m256i b, int imm8)

Synopsis

__m256i _mm256_mask_dbsad_epu8 (__m256i src, __mmask16 k, __m256i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

Operation

FOR j := 0 to 1 i := j*128 tmp[i+31:i] := select(b[i+127:i], imm8[1:0]) tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2]) tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4]) tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6]) ENDFOR FOR j := 0 to 3 i := j*64 tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) ENDFOR FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vdbpsadbw
__m256i _mm256_maskz_dbsad_epu8 (__mmask16 k, __m256i a, __m256i b, int imm8)

Synopsis

__m256i _mm256_maskz_dbsad_epu8 (__mmask16 k, __m256i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

Operation

FOR j := 0 to 1 i := j*128 tmp[i+31:i] := select(b[i+127:i], imm8[1:0]) tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2]) tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4]) tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6]) ENDFOR FOR j := 0 to 3 i := j*64 tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) ENDFOR FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vdbpsadbw
__m512i _mm512_dbsad_epu8 (__m512i a, __m512i b, int imm8)

Synopsis

__m512i _mm512_dbsad_epu8 (__m512i a, __m512i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512BW

Description

Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

Operation

FOR j := 0 to 3 i := j*128 tmp[i+31:i] := select(b[i+127:i], imm8[1:0]) tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2]) tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4]) tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6]) ENDFOR FOR j := 0 to 7 i := j*64 dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) ENDFOR dst[MAX:512] := 0
vdbpsadbw
__m512i _mm512_mask_dbsad_epu8 (__m512i src, __mmask32 k, __m512i a, __m512i b, int imm8)

Synopsis

__m512i _mm512_mask_dbsad_epu8 (__m512i src, __mmask32 k, __m512i a, __m512i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512BW

Description

Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

Operation

FOR j := 0 to 3 i := j*128 tmp[i+31:i] := select(b[i+127:i], imm8[1:0]) tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2]) tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4]) tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6]) ENDFOR FOR j := 0 to 7 i := j*64 tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) ENDFOR FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vdbpsadbw
__m512i _mm512_maskz_dbsad_epu8 (__mmask32 k, __m512i a, __m512i b, int imm8)

Synopsis

__m512i _mm512_maskz_dbsad_epu8 (__mmask32 k, __m512i a, __m512i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512BW

Description

Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

Operation

FOR j := 0 to 3 i := j*128 tmp[i+31:i] := select(b[i+127:i], imm8[1:0]) tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2]) tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4]) tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6]) ENDFOR FOR j := 0 to 7 i := j*64 tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) ENDFOR FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
delay
void _mm_delay_32 (unsigned int r1)

Synopsis

void _mm_delay_32 (unsigned int r1)
#include "immintrin.h"
Instruction: delay r32
CPUID Flags: KNCNI

Description

Stalls a thread without blocking other threads for 32-bit unsigned integer r1 clock cycles.

Operation

BlockThread(r1)
delay
void _mm_delay_64 (unsigned __int64 r1)

Synopsis

void _mm_delay_64 (unsigned __int64 r1)
#include "immintrin.h"
Instruction: delay r64
CPUID Flags: KNCNI

Description

Stalls a thread without blocking other threads for 64-bit unsigned integer r1 clock cycles.

Operation

BlockThread(r1)
...
__m128i _mm_div_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_div_epi16 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed 16-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 7 i := 16*j dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_div_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_div_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed 16-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 15 i := 16*j dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_div_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_div_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed 16-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 31 i := 16*j dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:512] := 0
...
__m128i _mm_div_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_div_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed 32-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_div_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_div_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed 32-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_div_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_div_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed 32-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512i _mm512_mask_div_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_div_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed 32-bit integers in a by packed elements in b, and store the truncated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128i _mm_div_epi64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_div_epi64 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed 64-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 1 i := 64*j dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_div_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_div_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed 64-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 3 i := 64*j dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_div_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_div_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed 64-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 7 i := 64*j dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m128i _mm_div_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_div_epi8 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed 8-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 15 i := 8*j dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_div_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_div_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed 8-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 31 i := 8*j dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_div_epi8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_div_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed 8-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 63 i := 8*j dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:512] := 0
...
__m128i _mm_div_epu16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_div_epu16 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed unsigned 16-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 7 i := 16*j dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_div_epu16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_div_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed unsigned 16-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 15 i := 16*j dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_div_epu16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_div_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed unsigned 16-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 31 i := 16*j dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:512] := 0
...
__m128i _mm_div_epu32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_div_epu32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed unsigned 32-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_div_epu32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_div_epu32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed unsigned 32-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_div_epu32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_div_epu32 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed unsigned 32-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512i _mm512_mask_div_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_div_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed unsigned 32-bit integers in a by packed elements in b, and store the truncated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128i _mm_div_epu64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_div_epu64 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed unsigned 64-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 1 i := 64*j dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_div_epu64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_div_epu64 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed unsigned 64-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 3 i := 64*j dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_div_epu64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_div_epu64 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed unsigned 64-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 7 i := 64*j dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m128i _mm_div_epu8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_div_epu8 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed unsigned 8-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 15 i := 8*j dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_div_epu8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_div_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed unsigned 8-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 31 i := 8*j dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_div_epu8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_div_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed unsigned 8-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 63 i := 8*j dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:512] := 0
divpd
__m128d _mm_div_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_div_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: divpd xmm, xmm
CPUID Flags: SSE2

Description

Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 1 i := 64*j dst[i+63:i] := a[i+63:i] / b[i+63:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell2012
Ivy Bridge2014
Sandy Bridge2122
Westmere2120
Nehalem2120
vdivpd
__m128d _mm_mask_div_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_div_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vdivpd
CPUID Flags: AVX512VL + AVX512F

Description

Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vdivpd
__m128d _mm_maskz_div_pd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_div_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vdivpd
CPUID Flags: AVX512VL + AVX512F

Description

Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vdivpd
__m256d _mm256_div_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_div_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vdivpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 3 i := 64*j dst[i+63:i] := a[i+63:i] / b[i+63:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3525
Ivy Bridge3528
Sandy Bridge4344
vdivpd
__m256d _mm256_mask_div_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_mask_div_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vdivpd
CPUID Flags: AVX512VL + AVX512F

Description

Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vdivpd
__m256d _mm256_maskz_div_pd (__mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_maskz_div_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vdivpd
CPUID Flags: AVX512VL + AVX512F

Description

Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vdivpd
__m512d _mm512_div_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_div_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vdivpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 7 i := 64*j dst[i+63:i] := a[i+63:i] / b[i+63:i] ENDFOR dst[MAX:512] := 0
vdivpd
__m512d _mm512_mask_div_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_div_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vdivpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vdivpd
__m512d _mm512_maskz_div_pd (__mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_maskz_div_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vdivpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
divps
__m128 _mm_div_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_div_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: divps xmm, xmm
CPUID Flags: SSE

Description

Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := a[i+31:i] / b[i+31:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell135
Ivy Bridge136
Sandy Bridge1414
Westmere1412
Nehalem1412
vdivps
__m128 _mm_mask_div_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_div_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vdivps
CPUID Flags: AVX512VL + AVX512F

Description

Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vdivps
__m128 _mm_maskz_div_ps (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_div_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vdivps
CPUID Flags: AVX512VL + AVX512F

Description

Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vdivps
__m256 _mm256_div_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_div_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vdivps ymm, ymm, ymm
CPUID Flags: AVX

Description

Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := a[i+31:i] / b[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell2113
Ivy Bridge2114
Sandy Bridge2928
vdivps
__m256 _mm256_mask_div_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_mask_div_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vdivps
CPUID Flags: AVX512VL + AVX512F

Description

Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vdivps
__m256 _mm256_maskz_div_ps (__mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_maskz_div_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vdivps
CPUID Flags: AVX512VL + AVX512F

Description

Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vdivps
__m512 _mm512_div_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_div_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vdivps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := a[i+31:i] / b[i+31:i] ENDFOR dst[MAX:512] := 0
vdivps
__m512 _mm512_mask_div_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_div_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vdivps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vdivps
__m512 _mm512_maskz_div_ps (__mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_maskz_div_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vdivps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vdivpd
__m512d _mm512_div_round_pd (__m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_div_round_pd (__m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vdivpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := 64*j dst[i+63:i] := a[i+63:i] / b[i+63:i] ENDFOR dst[MAX:512] := 0
vdivpd
__m512d _mm512_mask_div_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_mask_div_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vdivpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vdivpd
__m512d _mm512_maskz_div_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_maskz_div_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vdivpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vdivps
__m512 _mm512_div_round_ps (__m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_div_round_ps (__m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vdivps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := a[i+31:i] / b[i+31:i] ENDFOR dst[MAX:512] := 0
vdivps
__m512 _mm512_mask_div_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_mask_div_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vdivps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vdivps
__m512 _mm512_maskz_div_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_maskz_div_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vdivps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vdivsd
__m128d _mm_div_round_sd (__m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_div_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vdivsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := a[63:0] / b[63:0] dst[127:64] := a[127:64] dst[MAX:128] := 0
vdivsd
__m128d _mm_mask_div_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_mask_div_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vdivsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := a[63:0] / b[63:0] ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vdivsd
__m128d _mm_maskz_div_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_maskz_div_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vdivsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := a[63:0] / b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vdivss
__m128 _mm_div_round_ss (__m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_div_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vdivss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := a[31:0] / b[31:0] dst[127:32] := a[127:32] dst[MAX:128] := 0
vdivss
__m128 _mm_mask_div_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_mask_div_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vdivss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := a[31:0] / b[31:0] ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vdivss
__m128 _mm_maskz_div_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_maskz_div_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vdivss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := a[31:0] / b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
divsd
__m128d _mm_div_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_div_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: divsd xmm, xmm
CPUID Flags: SSE2

Description

Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := a[63:0] / b[63:0] dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell2012
Ivy Bridge2014
Sandy Bridge2122
Westmere2120
Nehalem2120
vdivsd
__m128d _mm_mask_div_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_div_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vdivsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := a[63:0] / b[63:0] ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vdivsd
__m128d _mm_maskz_div_sd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_div_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vdivsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := a[63:0] / b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
divss
__m128 _mm_div_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_div_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: divss xmm, xmm
CPUID Flags: SSE

Description

Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := a[31:0] / b[31:0] dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell135
Ivy Bridge136
Sandy Bridge1414
Westmere1412
Nehalem1412
vdivss
__m128 _mm_mask_div_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_div_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vdivss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := a[31:0] / b[31:0] ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vdivss
__m128 _mm_maskz_div_ss (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_div_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vdivss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := a[31:0] / b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
dppd
__m128d _mm_dp_pd (__m128d a, __m128d b, const int imm8)

Synopsis

__m128d _mm_dp_pd (__m128d a, __m128d b, const int imm8)
#include "smmintrin.h"
Instruction: dppd xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Conditionally multiply the packed double-precision (64-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the four products, and conditionally store the sum in dst using the low 4 bits of imm8.

Operation

DP(a[127:0], b[127:0], imm8[7:0]) { FOR j := 0 to 1 i := j*64 IF imm8[(4+j)%8] temp[i+63:i] := a[i+63:i] * b[i+63:i] ELSE temp[i+63:i] := 0 FI ENDFOR sum[63:0] := temp[127:64] + temp[63:0] FOR j := 0 to 1 i := j*64 IF imm8[j%8] tmpdst[i+63:i] := sum[63:0] ELSE tmpdst[i+63:i] := 0 FI ENDFOR RETURN tmpdst[127:0] } dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])

Performance

ArchitectureLatencyThroughput
Haswell9-
Ivy Bridge9-
Sandy Bridge9-
Westmere9-
Nehalem9-
dpps
__m128 _mm_dp_ps (__m128 a, __m128 b, const int imm8)

Synopsis

__m128 _mm_dp_ps (__m128 a, __m128 b, const int imm8)
#include "smmintrin.h"
Instruction: dpps xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Conditionally multiply the packed single-precision (32-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the four products, and conditionally store the sum in dst using the low 4 bits of imm8.

Operation

DP(a[127:0], b[127:0], imm8[7:0]) { FOR j := 0 to 3 i := j*32 IF imm8[(4+j)%8] temp[i+31:i] := a[i+31:i] * b[i+31:i] ELSE temp[i+31:i] := 0 FI ENDFOR sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0]) FOR j := 0 to 3 i := j*32 IF imm8[j%8] tmpdst[i+31:i] := sum[31:0] ELSE tmpdst[i+31:i] := 0 FI ENDFOR RETURN tmpdst[127:0] } dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])

Performance

ArchitectureLatencyThroughput
Haswell14-
Ivy Bridge12-
Sandy Bridge12-
Westmere11-
Nehalem11-
vdpps
__m256 _mm256_dp_ps (__m256 a, __m256 b, const int imm8)

Synopsis

__m256 _mm256_dp_ps (__m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vdpps ymm, ymm, ymm, imm
CPUID Flags: AVX

Description

Conditionally multiply the packed single-precision (32-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the four products, and conditionally store the sum in dst using the low 4 bits of imm8.

Operation

DP(a[127:0], b[127:0], imm8[7:0]) { FOR j := 0 to 3 i := j*32 IF imm8[(4+j)%8] temp[i+31:i] := a[i+31:i] * b[i+31:i] ELSE temp[i+31:i] := 0 FI ENDFOR sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0]) FOR j := 0 to 3 i := j*32 IF imm8[j%8] tmpdst[i+31:i] := sum[31:0] ELSE tmpdst[i+31:i] := 0 FI ENDFOR RETURN tmpdst[127:0] } dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0]) dst[255:128] := DP(a[255:128], b[255:128], imm8[7:0]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell142
Ivy Bridge122
Sandy Bridge122
...
__m128d _mm_erf_pd (__m128d a)

Synopsis

__m128d _mm_erf_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ERF(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_erf_pd (__m256d a)

Synopsis

__m256d _mm256_erf_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ERF(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_erf_pd (__m512d a)

Synopsis

__m512d _mm512_erf_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ERF(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_erf_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_erf_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ERF(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_erf_ps (__m128 a)

Synopsis

__m128 _mm_erf_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ERF(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_erf_ps (__m256 a)

Synopsis

__m256 _mm256_erf_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ERF(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_erf_ps (__m512 a)

Synopsis

__m512 _mm512_erf_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ERF(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_erf_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_erf_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ERF(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_erfc_pd (__m128d a)

Synopsis

__m128d _mm_erfc_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := 1.0 - ERF(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_erfc_pd (__m256d a)

Synopsis

__m256d _mm256_erfc_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := 1.0 - ERF(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_erfc_pd (__m512d a)

Synopsis

__m512d _mm512_erfc_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := 1.0 - ERF(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_erfc_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_erfc_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := 1.0 - ERF(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_erfc_ps (__m128 a)

Synopsis

__m128 _mm_erfc_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := 1.0 - ERF(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_erfc_ps (__m256 a)

Synopsis

__m256 _mm256_erfc_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := 1.0 - ERF(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_erfc_ps (__m512 a)

Synopsis

__m512 _mm512_erfc_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := 1.0 - ERF(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_erfc_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_erfc_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := 1.0 - ERF(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_erfcinv_pd (__m128d a)

Synopsis

__m128d _mm_erfcinv_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ERFCInv(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_erfcinv_pd (__m256d a)

Synopsis

__m256d _mm256_erfcinv_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ERFCInv(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_erfcinv_pd (__m512d a)

Synopsis

__m512d _mm512_erfcinv_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ERFCInv(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_erfcinv_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_erfcinv_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ERFCInv(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_erfcinv_ps (__m128 a)

Synopsis

__m128 _mm_erfcinv_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ERFCInv(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_erfcinv_ps (__m256 a)

Synopsis

__m256 _mm256_erfcinv_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ERFCInv(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_erfcinv_ps (__m512 a)

Synopsis

__m512 _mm512_erfcinv_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ERFCInv(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_erfcinv_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_erfcinv_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ERFCInv(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_erfinv_pd (__m128d a)

Synopsis

__m128d _mm_erfinv_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ERFInv(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_erfinv_pd (__m256d a)

Synopsis

__m256d _mm256_erfinv_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ERFInv(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_erfinv_pd (__m512d a)

Synopsis

__m512d _mm512_erfinv_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ERFInv(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_erfinv_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_erfinv_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ERFInv(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_erfinv_ps (__m128 a)

Synopsis

__m128 _mm_erfinv_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ERFInv(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_erfinv_ps (__m256 a)

Synopsis

__m256 _mm256_erfinv_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ERFInv(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_erfinv_ps (__m512 a)

Synopsis

__m512 _mm512_erfinv_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ERFInv(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_erfinv_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_erfinv_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ERFInv(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_exp_pd (__m128d a)

Synopsis

__m128d _mm_exp_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := e^(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_exp_pd (__m256d a)

Synopsis

__m256d _mm256_exp_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := e^(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_exp_pd (__m512d a)

Synopsis

__m512d _mm512_exp_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := e^(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_exp_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_exp_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := e^(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_exp_ps (__m128 a)

Synopsis

__m128 _mm_exp_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := e^(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_exp_ps (__m256 a)

Synopsis

__m256 _mm256_exp_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := e^(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_exp_ps (__m512 a)

Synopsis

__m512 _mm512_exp_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := e^(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_exp_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_exp_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := e^(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_exp10_pd (__m128d a)

Synopsis

__m128d _mm_exp10_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := 10^(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_exp10_pd (__m256d a)

Synopsis

__m256d _mm256_exp10_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := 10^(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_exp10_pd (__m512d a)

Synopsis

__m512d _mm512_exp10_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := 10^(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_exp10_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_exp10_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := 10^(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_exp10_ps (__m128 a)

Synopsis

__m128 _mm_exp10_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := 10^(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_exp10_ps (__m256 a)

Synopsis

__m256 _mm256_exp10_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := 10^(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_exp10_ps (__m512 a)

Synopsis

__m512 _mm512_exp10_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := 10^(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_exp10_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_exp10_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := 10^(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_exp2_pd (__m128d a)

Synopsis

__m128d _mm_exp2_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := 2^(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_exp2_pd (__m256d a)

Synopsis

__m256d _mm256_exp2_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := 2^(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_exp2_pd (__m512d a)

Synopsis

__m512d _mm512_exp2_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := 2^(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_exp2_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_exp2_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := 2^(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_exp2_ps (__m128 a)

Synopsis

__m128 _mm_exp2_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := 2^(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_exp2_ps (__m256 a)

Synopsis

__m256 _mm256_exp2_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := 2^(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_exp2_ps (__m512 a)

Synopsis

__m512 _mm512_exp2_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := 2^(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_exp2_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_exp2_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := 2^(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vexp223ps
__m512 _mm512_exp223_ps (__m512i v2)

Synopsis

__m512 _mm512_exp223_ps (__m512i v2)
#include "immintrin.h"
Instruction: vexp223ps zmm {k}, zmm
CPUID Flags: KNCNI

Description

Approximates the base-2 exponent of the packed single-precision (32-bit) floating-point elements in v2 with eight bits for sign and magnitude and 24 bits for the fractional part. Results are stored in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := exp223(v2[i+31:i]) ENDFOR dst[MAX:512] := 0
vexp223ps
__m512 _mm512_mask_exp223_ps (__m512 src, __mmask16 k, __m512i v2)

Synopsis

__m512 _mm512_mask_exp223_ps (__m512 src, __mmask16 k, __m512i v2)
#include "immintrin.h"
Instruction: vexp223ps zmm {k}, zmm
CPUID Flags: KNCNI

Description

Approximates the base-2 exponent of the packed single-precision (32-bit) floating-point elements in v2 with eight bits for sign and magnitude and 24 bits for the fractional part. Results are stored in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := exp223(v2[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vexp2pd
__m512d _mm512_exp2a23_pd (__m512d a)

Synopsis

__m512d _mm512_exp2a23_pd (__m512d a)
#include "immintrin.h"
Instruction: vexp2pd zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-23.

Operation

FOR j := 0 to 7 i := j*64; dst[i+63:i] := EXP_2_23_DP(a[i+63:i]); ENDFOR; dst[MAX:512] := 0
vexp2pd
__m512d _mm512_mask_exp2a23_pd (__m512d a, __mmask8 k, __m512d src)

Synopsis

__m512d _mm512_mask_exp2a23_pd (__m512d a, __mmask8 k, __m512d src)
#include "immintrin.h"
Instruction: vexp2pd zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.

Operation

FOR j := 0 to 7 i := j*64; IF k[j] THEN dst[i+63:i] := EXP_2_23_DP(a[i+63:i]); ELSE dst[i+63:i] := src[i+63:i]; FI ENDFOR; dst[MAX:512] := 0
vexp2pd
__m512d _mm512_maskz_exp2a23_pd (__mmask8 k, __m512d a)

Synopsis

__m512d _mm512_maskz_exp2a23_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vexp2pd zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.

Operation

FOR j := 0 to 7 i := j*64; IF k[j] THEN dst[i+63:i] := EXP_2_23_DP(a[i+63:i]); ELSE dst[i+63:i] := 0; FI ENDFOR; dst[MAX:512] := 0
vexp2ps
__m512 _mm512_exp2a23_ps (__m512 a)

Synopsis

__m512 _mm512_exp2a23_ps (__m512 a)
#include "immintrin.h"
Instruction: vexp2ps zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-23.

Operation

FOR j := 0 to 15 i := j*32; dst[i+31:i] := EXP_2_23_SP(a[i+31:i]); ENDFOR; dst[MAX:512] := 0
vexp2ps
__m512 _mm512_mask_exp2a23_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_exp2a23_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vexp2ps zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.

Operation

FOR j := 0 to 15 i := j*32; IF k[j] THEN dst[i+31:i] := EXP_2_23_SP(a[i+31:i]); ELSE dst[i+31:i] := src[i+31:i]; FI ENDFOR; dst[MAX:512] := 0
vexp2ps
__m512 _mm512_maskz_exp2a23_ps (__mmask16 k, __m512 a)

Synopsis

__m512 _mm512_maskz_exp2a23_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vexp2ps zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.

Operation

FOR j := 0 to 15 i := j*32; IF k[j] THEN dst[i+31:i] := EXP_2_23_SP(a[i+31:i]); ELSE dst[i+31:i] := 0; FI ENDFOR; dst[MAX:512] := 0
vexp2pd
__m512d _mm512_exp2a23_round_pd (__m512d a, int rounding)

Synopsis

__m512d _mm512_exp2a23_round_pd (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vexp2pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-23.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64; dst[i+63:i] := EXP_2_23_DP(a[i+63:i]); ENDFOR; dst[MAX:512] := 0
vexp2pd
__m512d _mm512_mask_exp2a23_round_pd (__m512d a, __mmask8 k, __m512d src, int rounding)

Synopsis

__m512d _mm512_mask_exp2a23_round_pd (__m512d a, __mmask8 k, __m512d src, int rounding)
#include "immintrin.h"
Instruction: vexp2pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64; IF k[j] THEN dst[i+63:i] := EXP_2_23_DP(a[i+63:i]); ELSE dst[i+63:i] := src[i+63:i]; FI ENDFOR; dst[MAX:512] := 0
vexp2pd
__m512d _mm512_maskz_exp2a23_round_pd (__mmask8 k, __m512d a, int rounding)

Synopsis

__m512d _mm512_maskz_exp2a23_round_pd (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vexp2pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64; IF k[j] THEN dst[i+63:i] := EXP_2_23_DP(a[i+63:i]); ELSE dst[i+63:i] := 0; FI ENDFOR; dst[MAX:512] := 0
vexp2ps
__m512 _mm512_exp2a23_round_ps (__m512 a, int rounding)

Synopsis

__m512 _mm512_exp2a23_round_ps (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vexp2ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-23.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32; dst[i+31:i] := EXP_2_23_SP(a[i+31:i]); ENDFOR; dst[MAX:512] := 0
vexp2ps
__m512 _mm512_mask_exp2a23_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)

Synopsis

__m512 _mm512_mask_exp2a23_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vexp2ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32; IF k[j] THEN dst[i+31:i] := EXP_2_23_SP(a[i+31:i]); ELSE dst[i+31:i] := src[i+31:i]; FI ENDFOR; dst[MAX:512] := 0
vexp2ps
__m512 _mm512_maskz_exp2a23_round_ps (__mmask16 k, __m512 a, int rounding)

Synopsis

__m512 _mm512_maskz_exp2a23_round_ps (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vexp2ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32; IF k[j] THEN dst[i+31:i] := EXP_2_23_SP(a[i+31:i]); ELSE dst[i+31:i] := 0; FI ENDFOR; dst[MAX:512] := 0
vpexpandd
__m128i _mm_mask_expand_epi32 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_expand_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpexpandd
__m128i _mm_maskz_expand_epi32 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_expand_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpexpandd
__m256i _mm256_mask_expand_epi32 (__m256i src, __mmask8 k, __m256i a)

Synopsis

__m256i _mm256_mask_expand_epi32 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpexpandd
__m256i _mm256_maskz_expand_epi32 (__mmask8 k, __m256i a)

Synopsis

__m256i _mm256_maskz_expand_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpexpandd
__m512i _mm512_mask_expand_epi32 (__m512i src, __mmask16 k, __m512i a)

Synopsis

__m512i _mm512_mask_expand_epi32 (__m512i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpexpandd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpexpandd
__m512i _mm512_maskz_expand_epi32 (__mmask16 k, __m512i a)

Synopsis

__m512i _mm512_maskz_expand_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpexpandd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpexpandq
__m128i _mm_mask_expand_epi64 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_expand_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpexpandq
__m128i _mm_maskz_expand_epi64 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_expand_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpexpandq
__m256i _mm256_mask_expand_epi64 (__m256i src, __mmask8 k, __m256i a)

Synopsis

__m256i _mm256_mask_expand_epi64 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpexpandq
__m256i _mm256_maskz_expand_epi64 (__mmask8 k, __m256i a)

Synopsis

__m256i _mm256_maskz_expand_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpexpandq
__m512i _mm512_mask_expand_epi64 (__m512i src, __mmask8 k, __m512i a)

Synopsis

__m512i _mm512_mask_expand_epi64 (__m512i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpexpandq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpexpandq
__m512i _mm512_maskz_expand_epi64 (__mmask8 k, __m512i a)

Synopsis

__m512i _mm512_maskz_expand_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpexpandq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vexpandpd
__m128d _mm_mask_expand_pd (__m128d src, __mmask8 k, __m128d a)

Synopsis

__m128d _mm_mask_expand_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vexpandpd
__m128d _mm_maskz_expand_pd (__mmask8 k, __m128d a)

Synopsis

__m128d _mm_maskz_expand_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vexpandpd
__m256d _mm256_mask_expand_pd (__m256d src, __mmask8 k, __m256d a)

Synopsis

__m256d _mm256_mask_expand_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vexpandpd
__m256d _mm256_maskz_expand_pd (__mmask8 k, __m256d a)

Synopsis

__m256d _mm256_maskz_expand_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vexpandpd
__m512d _mm512_mask_expand_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_expand_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vexpandpd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vexpandpd
__m512d _mm512_maskz_expand_pd (__mmask8 k, __m512d a)

Synopsis

__m512d _mm512_maskz_expand_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vexpandpd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vexpandps
__m128 _mm_mask_expand_ps (__m128 src, __mmask8 k, __m128 a)

Synopsis

__m128 _mm_mask_expand_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vexpandps
__m128 _mm_maskz_expand_ps (__mmask8 k, __m128 a)

Synopsis

__m128 _mm_maskz_expand_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vexpandps
__m256 _mm256_mask_expand_ps (__m256 src, __mmask8 k, __m256 a)

Synopsis

__m256 _mm256_mask_expand_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vexpandps
__m256 _mm256_maskz_expand_ps (__mmask8 k, __m256 a)

Synopsis

__m256 _mm256_maskz_expand_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vexpandps
__m512 _mm512_mask_expand_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_expand_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vexpandps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vexpandps
__m512 _mm512_maskz_expand_ps (__mmask16 k, __m512 a)

Synopsis

__m512 _mm512_maskz_expand_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vexpandps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpexpandd
__m128i _mm_mask_expandloadu_epi32 (__m128i src, __mmask8 k, void const* mem_addr)

Synopsis

__m128i _mm_mask_expandloadu_epi32 (__m128i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpexpandd
__m128i _mm_maskz_expandloadu_epi32 (__mmask8 k, void const* mem_addr)

Synopsis

__m128i _mm_maskz_expandloadu_epi32 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpexpandd
__m256i _mm256_mask_expandloadu_epi32 (__m256i src, __mmask8 k, void const* mem_addr)

Synopsis

__m256i _mm256_mask_expandloadu_epi32 (__m256i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpexpandd
__m256i _mm256_maskz_expandloadu_epi32 (__mmask8 k, void const* mem_addr)

Synopsis

__m256i _mm256_maskz_expandloadu_epi32 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpexpandd
__m512i _mm512_mask_expandloadu_epi32 (__m512i src, __mmask16 k, void const* mem_addr)

Synopsis

__m512i _mm512_mask_expandloadu_epi32 (__m512i src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandd zmm {k}, m32
CPUID Flags: AVX512F

Description

Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpexpandd
__m512i _mm512_maskz_expandloadu_epi32 (__mmask16 k, void const* mem_addr)

Synopsis

__m512i _mm512_maskz_expandloadu_epi32 (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandd zmm {k}, m32
CPUID Flags: AVX512F

Description

Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpexpandq
__m128i _mm_mask_expandloadu_epi64 (__m128i src, __mmask8 k, void const* mem_addr)

Synopsis

__m128i _mm_mask_expandloadu_epi64 (__m128i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpexpandq
__m128i _mm_maskz_expandloadu_epi64 (__mmask8 k, void const* mem_addr)

Synopsis

__m128i _mm_maskz_expandloadu_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpexpandq
__m256i _mm256_mask_expandloadu_epi64 (__m256i src, __mmask8 k, void const* mem_addr)

Synopsis

__m256i _mm256_mask_expandloadu_epi64 (__m256i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpexpandq
__m256i _mm256_maskz_expandloadu_epi64 (__mmask8 k, void const* mem_addr)

Synopsis

__m256i _mm256_maskz_expandloadu_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpexpandq
__m512i _mm512_mask_expandloadu_epi64 (__m512i src, __mmask8 k, void const* mem_addr)

Synopsis

__m512i _mm512_mask_expandloadu_epi64 (__m512i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandq zmm {k}, m64
CPUID Flags: AVX512F

Description

Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpexpandq
__m512i _mm512_maskz_expandloadu_epi64 (__mmask8 k, void const* mem_addr)

Synopsis

__m512i _mm512_maskz_expandloadu_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandq zmm {k}, m64
CPUID Flags: AVX512F

Description

Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vexpandpd
__m128d _mm_mask_expandloadu_pd (__m128d src, __mmask8 k, void const* mem_addr)

Synopsis

__m128d _mm_mask_expandloadu_pd (__m128d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vexpandpd
__m128d _mm_maskz_expandloadu_pd (__mmask8 k, void const* mem_addr)

Synopsis

__m128d _mm_maskz_expandloadu_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vexpandpd
__m256d _mm256_mask_expandloadu_pd (__m256d src, __mmask8 k, void const* mem_addr)

Synopsis

__m256d _mm256_mask_expandloadu_pd (__m256d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vexpandpd
__m256d _mm256_maskz_expandloadu_pd (__mmask8 k, void const* mem_addr)

Synopsis

__m256d _mm256_maskz_expandloadu_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vexpandpd
__m512d _mm512_mask_expandloadu_pd (__m512d src, __mmask8 k, void const* mem_addr)

Synopsis

__m512d _mm512_mask_expandloadu_pd (__m512d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandpd zmm {k}, m512
CPUID Flags: AVX512F

Description

Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vexpandpd
__m512d _mm512_maskz_expandloadu_pd (__mmask8 k, void const* mem_addr)

Synopsis

__m512d _mm512_maskz_expandloadu_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandpd zmm {k}, m512
CPUID Flags: AVX512F

Description

Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vexpandps
__m128 _mm_mask_expandloadu_ps (__m128 src, __mmask8 k, void const* mem_addr)

Synopsis

__m128 _mm_mask_expandloadu_ps (__m128 src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vexpandps
__m128 _mm_maskz_expandloadu_ps (__mmask8 k, void const* mem_addr)

Synopsis

__m128 _mm_maskz_expandloadu_ps (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vexpandps
__m256 _mm256_mask_expandloadu_ps (__m256 src, __mmask8 k, void const* mem_addr)

Synopsis

__m256 _mm256_mask_expandloadu_ps (__m256 src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vexpandps
__m256 _mm256_maskz_expandloadu_ps (__mmask8 k, void const* mem_addr)

Synopsis

__m256 _mm256_maskz_expandloadu_ps (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F

Description

Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vexpandps
__m512 _mm512_mask_expandloadu_ps (__m512 src, __mmask16 k, void const* mem_addr)

Synopsis

__m512 _mm512_mask_expandloadu_ps (__m512 src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandps zmm {k}, m512
CPUID Flags: AVX512F

Description

Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vexpandps
__m512 _mm512_maskz_expandloadu_ps (__mmask16 k, void const* mem_addr)

Synopsis

__m512 _mm512_maskz_expandloadu_ps (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandps zmm {k}, m512
CPUID Flags: AVX512F

Description

Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_expm1_pd (__m128d a)

Synopsis

__m128d _mm_expm1_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, subtract one from each element, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := e^(a[i+63:i]) - 1.0 ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_expm1_pd (__m256d a)

Synopsis

__m256d _mm256_expm1_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, subtract one from each element, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := e^(a[i+63:i]) - 1.0 ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_expm1_pd (__m512d a)

Synopsis

__m512d _mm512_expm1_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, subtract one from each element, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := e^(a[i+63:i]) - 1.0 ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_expm1_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_expm1_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, subtract one from each element, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := e^(a[i+63:i]) - 1.0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_expm1_ps (__m128 a)

Synopsis

__m128 _mm_expm1_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, subtract one from each element, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := e^(a[i+31:i]) - 1.0 ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_expm1_ps (__m256 a)

Synopsis

__m256 _mm256_expm1_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, subtract one from each element, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := e^(a[i+31:i]) - 1.0 ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_expm1_ps (__m512 a)

Synopsis

__m512 _mm512_expm1_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, subtract one from each element, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := e^(a[i+31:i]) - 1.0 ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_expm1_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_expm1_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, subtract one from each element, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := e^(a[i+31:i]) - 1.0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vmovdqa32, vbroadcasti32x4, vpbroadcastd
__m512i _mm512_extload_epi32 (void const * mt, _MM_UPCONV_EPI32_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)

Synopsis

__m512i _mm512_extload_epi32 (void const * mt, _MM_UPCONV_EPI32_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, m512
             vbroadcasti32x4 zmm {k}, m512
             vpbroadcastd zmm {k}, m512
CPUID Flags: KNCNI

Description

Depending on bc, loads 1, 4, or 16 elements of type and size determined by conv from memory address mt and converts all elements to 32-bit integer elements, storing the results in dst. hint indicates to the processor whether the data is non-temporal.

Operation

addr = MEM[mt] FOR j := 0 to 15 i := j*32 CASE bc OF _MM_BROADCAST32_NONE: CASE conv OF _MM_UPCONV_EPI32_NONE: n := j*32 dst[i+31:i] := addr[n+31:n] _MM_UPCONV_EPI32_UINT8: n := j*8 dst[i+31:i] := UInt8ToInt32(addr[n+7:n]) _MM_UPCONV_EPI32_SINT8: n := j*8 dst[i+31:i] := SInt8ToInt32(addr[n+7:n]) _MM_UPCONV_EPI32_UINT16: n := j*16 dst[i+31:i] := UInt16ToInt32(addr[n+15:n]) _MM_UPCONV_EPI32_SINT16: n := j*16 dst[i+31:i] := SInt16ToInt32(addr[n+15:n]) ESAC _MM_BROADCAST_1X16: CASE conv OF _MM_UPCONV_EPI32_NONE: n := j*32 dst[i+31:i] := addr[31:0] _MM_UPCONV_EPI32_UINT8: n := j*8 dst[i+31:i] := UInt8ToInt32(addr[7:0]) _MM_UPCONV_EPI32_SINT8: n := j*8 dst[i+31:i] := SInt8ToInt32(addr[7:0]) _MM_UPCONV_EPI32_UINT16: n := j*16 dst[i+31:i] := UInt16ToInt32(addr[15:0]) _MM_UPCONV_EPI32_SINT16: n := j*16 dst[i+31:i] := SInt16ToInt32(addr[15:0]) ESAC _MM_BROADCAST_4X16: mod := j%4 CASE conv OF _MM_UPCONV_EPI32_NONE: n := mod*32 dst[i+31:i] := addr[n+31:n] _MM_UPCONV_EPI32_UINT8: n := mod*8 dst[i+31:i] := UInt8ToInt32(addr[n+7:n]) _MM_UPCONV_EPI32_SINT8: n := mod*8 dst[i+31:i] := SInt8ToInt32(addr[n+7:n]) _MM_UPCONV_EPI32_UINT16: n := mod*16 dst[i+31:i] := UInt16ToInt32(addr[n+15:n]) _MM_UPCONV_EPI32_SINT16: n := mod*16 dst[i+31:i] := SInt16ToInt32(addr[n+15:n]) ESAC ESAC ENDFOR dst[MAX:512] := 0
vmovdqa32, vbroadcasti32x4, vpbroadcastd
__m512i _mm512_mask_extload_epi32 (__m512i src, __mmask16 k, void const * mt, _MM_UPCONV_EPI32_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)

Synopsis

__m512i _mm512_mask_extload_epi32 (__m512i src, __mmask16 k, void const * mt, _MM_UPCONV_EPI32_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, m512
             vbroadcasti32x4 zmm {k}, m512
             vpbroadcastd zmm {k}, m512
CPUID Flags: KNCNI

Description

Depending on bc, loads 1, 4, or 16 elements of type and size determined by conv from memory address mt and converts all elements to 32-bit integer elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.

Operation

addr = MEM[mt] FOR j := 0 to 15 i := j*32 IF k[j] CASE bc OF _MM_BROADCAST32_NONE: CASE conv OF _MM_UPCONV_EPI32_NONE: n := j*32 dst[i+31:i] := addr[n+31:n] _MM_UPCONV_EPI32_UINT8: n := j*8 dst[i+31:i] := UInt8ToInt32(addr[n+7:n]) _MM_UPCONV_EPI32_SINT8: n := j*8 dst[i+31:i] := SInt8ToInt32(addr[n+7:n]) _MM_UPCONV_EPI32_UINT16: n := j*16 dst[i+31:i] := UInt16ToInt32(addr[n+15:n]) _MM_UPCONV_EPI32_SINT16: n := j*16 dst[i+31:i] := SInt16ToInt32(addr[n+15:n]) ESAC _MM_BROADCAST_1X16: CASE conv OF _MM_UPCONV_EPI32_NONE: n := j*32 dst[i+31:i] := addr[31:0] _MM_UPCONV_EPI32_UINT8: n := j*8 dst[i+31:i] := UInt8ToInt32(addr[7:0]) _MM_UPCONV_EPI32_SINT8: n := j*8 dst[i+31:i] := SInt8ToInt32(addr[7:0]) _MM_UPCONV_EPI32_UINT16: n := j*16 dst[i+31:i] := UInt16ToInt32(addr[15:0]) _MM_UPCONV_EPI32_SINT16: n := j*16 dst[i+31:i] := SInt16ToInt32(addr[15:0]) ESAC _MM_BROADCAST_4X16: mod := j%4 CASE conv OF _MM_UPCONV_EPI32_NONE: n := mod*32 dst[i+31:i] := addr[n+31:n] _MM_UPCONV_EPI32_UINT8: n := mod*8 dst[i+31:i] := UInt8ToInt32(addr[n+7:n]) _MM_UPCONV_EPI32_SINT8: n := mod*8 dst[i+31:i] := SInt8ToInt32(addr[n+7:n]) _MM_UPCONV_EPI32_UINT16: n := mod*16 dst[i+31:i] := UInt16ToInt32(addr[n+15:n]) _MM_UPCONV_EPI32_SINT16: n := mod*16 dst[i+31:i] := SInt16ToInt32(addr[n+15:n]) ESAC ESAC ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vmovdqa64, vbroadcasti64x4, vpbroadcastq
__m512i _mm512_extload_epi64 (void const * mt, _MM_UPCONV_EPI64_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)

Synopsis

__m512i _mm512_extload_epi64 (void const * mt, _MM_UPCONV_EPI64_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovdqa64 zmm {k}, m512
             vbroadcasti64x4 zmm {k}, m512
             vpbroadcastq zmm {k}, m512
CPUID Flags: KNCNI

Description

Depending on bc, loads 1, 4, or 8 elements of type and size determined by conv from memory address mt and converts all elements to 64-bit integer elements, storing the results in dst. hint indicates to the processor whether the data is non-temporal.

Operation

addr = MEM[mt] FOR j := 0 to 7 i := j*64 CASE bc OF _MM_BROADCAST64_NONE: CASE conv OF _MM_UPCONV_EPI64_NONE: n := j*64 dst[i+63:i] := addr[n+63:n] ESAC _MM_BROADCAST_1X8: CASE conv OF _MM_UPCONV_EPI64_NONE: n := j*64 dst[i+63:i] := addr[63:0] ESAC _MM_BROADCAST_4X8: mod := j%4 CASE conv OF _MM_UPCONV_EPI64_NONE: n := mod*64 dst[i+63:i] := addr[n+63:n] ESAC ESAC ENDFOR dst[MAX:512] := 0
vmovdqa64, vbroadcasti64x4, vpbroadcastq
__m512i _mm512_mask_extload_epi64 (__m512i src, __mmask8 k, void const * mt, _MM_UPCONV_EPI64_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)

Synopsis

__m512i _mm512_mask_extload_epi64 (__m512i src, __mmask8 k, void const * mt, _MM_UPCONV_EPI64_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovdqa64 zmm {k}, m512
             vbroadcasti64x4 zmm {k}, m512
             vpbroadcastq zmm {k}, m512
CPUID Flags: KNCNI

Description

Depending on bc, loads 1, 4, or 8 elements of type and size determined by conv from memory address mt and converts all elements to 64-bit integer elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.

Operation

addr = MEM[mt] FOR j := 0 to 7 i := j*64 IF k[j] CASE bc OF _MM_BROADCAST64_NONE: CASE conv OF _MM_UPCONV_EPI64_NONE: n := j*64 dst[i+63:i] := addr[n+63:n] ESAC _MM_BROADCAST_1X8: CASE conv OF _MM_UPCONV_EPI64_NONE: n := j*64 dst[i+63:i] := addr[63:0] ESAC _MM_BROADCAST_4X8: mod := j%4 CASE conv OF _MM_UPCONV_EPI64_NONE: n := mod*64 dst[i+63:i] := addr[n+63:n] ESAC ESAC ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vmovapd, vbroadcastf64x4, vbroadcastsd
__m512d _mm512_extload_pd (void const * mt, _MM_UPCONV_PD_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)

Synopsis

__m512d _mm512_extload_pd (void const * mt, _MM_UPCONV_PD_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovapd zmm {k}, m512
             vbroadcastf64x4 zmm {k}, m512
             vbroadcastsd zmm {k}, m512
CPUID Flags: KNCNI

Description

Depending on bc, loads 1, 4, or 8 elements of type and size determined by conv from memory address mt and converts all elements to double-precision (64-bit) floating-point elements, storing the results in dst. hint indicates to the processor whether the data is non-temporal.

Operation

addr = MEM[mt] FOR j := 0 to 7 i := j*64 CASE bc OF _MM_BROADCAST64_NONE: CASE conv OF _MM_UPCONV_PD_NONE: n := j*64 dst[i+63:i] := addr[n+63:n] ESAC _MM_BROADCAST_1X8: CASE conv OF _MM_UPCONV_PD_NONE: n := j*64 dst[i+63:i] := addr[63:0] ESAC _MM_BROADCAST_4X8: mod := j%4 CASE conv OF _MM_UPCONV_PD_NONE: n := mod*64 dst[i+63:i] := addr[n+63:n] ESAC ESAC ENDFOR dst[MAX:512] := 0
vmovapd, vbroadcastf64x4, vbroadcastsd
__m512d _mm512_mask_extload_pd (__m512d src, __mmask8 k, void const * mt, _MM_UPCONV_PD_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)

Synopsis

__m512d _mm512_mask_extload_pd (__m512d src, __mmask8 k, void const * mt, _MM_UPCONV_PD_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovapd zmm {k}, m512
             vbroadcastf64x4 zmm {k}, m512
             vbroadcastsd zmm {k}, m512
CPUID Flags: KNCNI

Description

Depending on bc, loads 1, 4, or 8 elements of type and size determined by conv from memory address mt and converts all elements to double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.

Operation

addr = MEM[mt] FOR j := 0 to 7 i := j*64 IF k[j] CASE bc OF _MM_BROADCAST64_NONE: CASE conv OF _MM_UPCONV_PD_NONE: n := j*64 dst[i+63:i] := addr[n+63:n] ESAC _MM_BROADCAST_1X8: CASE conv OF _MM_UPCONV_PD_NONE: n := j*64 dst[i+63:i] := addr[63:0] ESAC _MM_BROADCAST_4X8: mod := j%4 CASE conv OF _MM_UPCONV_PD_NONE: n := mod*64 dst[i+63:i] := addr[n+63:n] ESAC ESAC ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vmovaps, vbroadcastf32x4, vbroadcastss
__m512 _mm512_extload_ps (void const * mt, _MM_UPCONV_PS_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)

Synopsis

__m512 _mm512_extload_ps (void const * mt, _MM_UPCONV_PS_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovaps zmm {k}, m512
             vbroadcastf32x4 zmm {k}, m512
             vbroadcastss zmm {k}, m512
CPUID Flags: KNCNI

Description

Depending on bc, loads 1, 4, or 16 elements of type and size determined by conv from memory address mt and converts all elements to single-precision (32-bit) floating-point elements, storing the results in dst. hint indicates to the processor whether the data is non-temporal.

Operation

addr = MEM[mt] FOR j := 0 to 15 i := j*32 CASE bc OF _MM_BROADCAST32_NONE: CASE conv OF _MM_UPCONV_PS_NONE: n := j*32 dst[i+31:i] := addr[n+31:n] _MM_UPCONV_PS_FLOAT16: n := j*16 dst[i+31:i] := Float16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_UINT8: n := j*8 dst[i+31:i] := UInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_SINT8: n := j*8 dst[i+31:i] := SInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_UINT16: n := j*16 dst[i+31:i] := UInt16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_SINT16: n := j*16 dst[i+31:i] := SInt16ToFloat32(addr[n+15:n]) ESAC _MM_BROADCAST_1X16: CASE conv OF _MM_UPCONV_PS_NONE: n := j*32 dst[i+31:i] := addr[31:0] _MM_UPCONV_PS_FLOAT16: n := j*16 dst[i+31:i] := Float16ToFloat32(addr[15:0]) _MM_UPCONV_PS_UINT8: n := j*8 dst[i+31:i] := UInt8ToFloat32(addr[7:0]) _MM_UPCONV_PS_SINT8: n := j*8 dst[i+31:i] := SInt8ToFloat32(addr[7:0]) _MM_UPCONV_PS_UINT16: n := j*16 dst[i+31:i] := UInt16ToFloat32(addr[15:0]) _MM_UPCONV_PS_SINT16: n := j*16 dst[i+31:i] := SInt16ToFloat32(addr[15:0]) ESAC _MM_BROADCAST_4X16: mod := j%4 CASE conv OF _MM_UPCONV_PS_NONE: n := mod*32 dst[i+31:i] := addr[n+31:n] _MM_UPCONV_PS_FLOAT16: n := mod*16 dst[i+31:i] := Float16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_UINT8: n := mod*8 dst[i+31:i] := UInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_SINT8: n := mod*8 dst[i+31:i] := SInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_UINT16: n := mod*16 dst[i+31:i] := UInt16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_SINT16: n := mod*16 dst[i+31:i] := SInt16ToFloat32(addr[n+15:n]) ESAC ESAC ENDFOR dst[MAX:512] := 0
vmovaps, vbroadcastf32x4, vbroadcastss
__m512 _mm512_mask_extload_ps (__m512 src, __mmask16 k, void const * mt, _MM_UPCONV_PS_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)

Synopsis

__m512 _mm512_mask_extload_ps (__m512 src, __mmask16 k, void const * mt, _MM_UPCONV_PS_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovaps zmm {k}, m512
             vbroadcastf32x4 zmm {k}, m512
             vbroadcastss zmm {k}, m512
CPUID Flags: KNCNI

Description

Depending on bc, loads 1, 4, or 16 elements of type and size determined by conv from memory address mt and converts all elements to single-precision (32-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.

Operation

addr = MEM[mt] FOR j := 0 to 15 i := j*32 IF k[j] CASE bc OF _MM_BROADCAST32_NONE: CASE conv OF _MM_UPCONV_PS_NONE: n := j*32 dst[i+31:i] := addr[n+31:n] _MM_UPCONV_PS_FLOAT16: n := j*16 dst[i+31:i] := Float16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_UINT8: n := j*8 dst[i+31:i] := UInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_SINT8: n := j*8 dst[i+31:i] := SInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_UINT16: n := j*16 dst[i+31:i] := UInt16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_SINT16: n := j*16 dst[i+31:i] := SInt16ToFloat32(addr[n+15:n]) ESAC _MM_BROADCAST_1X16: CASE conv OF _MM_UPCONV_PS_NONE: n := j*32 dst[i+31:i] := addr[31:0] _MM_UPCONV_PS_FLOAT16: n := j*16 dst[i+31:i] := Float16ToFloat32(addr[15:0]) _MM_UPCONV_PS_UINT8: n := j*8 dst[i+31:i] := UInt8ToFloat32(addr[7:0]) _MM_UPCONV_PS_SINT8: n := j*8 dst[i+31:i] := SInt8ToFloat32(addr[7:0]) _MM_UPCONV_PS_UINT16: n := j*16 dst[i+31:i] := UInt16ToFloat32(addr[15:0]) _MM_UPCONV_PS_SINT16: n := j*16 dst[i+31:i] := SInt16ToFloat32(addr[15:0]) ESAC _MM_BROADCAST_4X16: mod := j%4 CASE conv OF _MM_UPCONV_PS_NONE: n := mod*32 dst[i+31:i] := addr[n+31:n] _MM_UPCONV_PS_FLOAT16: n := mod*16 dst[i+31:i] := Float16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_UINT8: n := mod*8 dst[i+31:i] := UInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_SINT8: n := mod*8 dst[i+31:i] := SInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_UINT16: n := mod*16 dst[i+31:i] := UInt16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_SINT16: n := mod*16 dst[i+31:i] := SInt16ToFloat32(addr[n+15:n]) ESAC ESAC ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vloadunpackhd
__m512i _mm512_extloadunpackhi_epi32 (__m512i src, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)

Synopsis

__m512i _mm512_extloadunpackhi_epi32 (__m512i src, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhd zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_EPI32_NONE: RETURN MEM[addr + 4*offset] _MM_UPCONV_EPI32_UINT8: RETURN UInt8ToInt32(MEM[addr + offset]) _MM_UPCONV_EPI32_SINT8: RETURN SInt8ToInt32(MEM[addr + offset]) _MM_UPCONV_EPI32_UINT16: RETURN UInt16ToInt32(MEM[addr + 2*offset]) _MM_UPCONV_EPI32_SINT16: RETURN SInt16ToInt32(MEM[addr + 2*offset]) ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_EPI32_NONE: RETURN 4 _MM_UPCONV_EPI32_UINT8: RETURN 1 _MM_UPCONV_EPI32_SINT8: RETURN 1 _MM_UPCONV_EPI32_UINT16: RETURN 2 _MM_UPCONV_EPI32_SINT16: RETURN 2 ESAC } dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false upSize := UPCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 15 IF foundNext64BytesBoundary == false IF (addr + (loadOffset + 1)*upSize) % 64 == 0 foundNext64BytesBoundary := true FI ELSE i := j*32 dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) FI loadOffset := loadOffset + 1 ENDFOR dst[MAX:512] := 0
vloadunpackhd
__m512i _mm512_mask_extloadunpackhi_epi32 (__m512i src, __mmask16 k, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)

Synopsis

__m512i _mm512_mask_extloadunpackhi_epi32 (__m512i src, __mmask16 k, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhd zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_EPI32_NONE: RETURN MEM[addr + 4*offset] _MM_UPCONV_EPI32_UINT8: RETURN UInt8ToInt32(MEM[addr + offset]) _MM_UPCONV_EPI32_SINT8: RETURN SInt8ToInt32(MEM[addr + offset]) _MM_UPCONV_EPI32_UINT16: RETURN UInt16ToInt32(MEM[addr + 2*offset]) _MM_UPCONV_EPI32_SINT16: RETURN SInt16ToInt32(MEM[addr + 2*offset]) ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_EPI32_NONE: RETURN 4 _MM_UPCONV_EPI32_UINT8: RETURN 1 _MM_UPCONV_EPI32_SINT8: RETURN 1 _MM_UPCONV_EPI32_UINT16: RETURN 2 _MM_UPCONV_EPI32_SINT16: RETURN 2 ESAC } dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false upSize := UPCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 15 IF k[j] IF foundNext64BytesBoundary == false IF (addr + (loadOffset + 1)*upSize) % 64 == 0 foundNext64BytesBoundary := true FI ELSE i := j*32 dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) FI loadOffset := loadOffset + 1 FI ENDFOR dst[MAX:512] := 0
vloadunpackhq
__m512i _mm512_extloadunpackhi_epi64 (__m512i src, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)

Synopsis

__m512i _mm512_extloadunpackhi_epi64 (__m512i src, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhq zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_EPI64_NONE: RETURN MEM[addr + 8*offset] ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_EPI64_NONE: RETURN 8 ESAC } dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false upSize := UPCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 7 IF foundNext64BytesBoundary == false IF (addr + (loadOffset + 1)*upSize) % 64 == 0 foundNext64BytesBoundary := true FI ELSE i := j*64 dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) FI loadOffset := loadOffset + 1 ENDFOR dst[MAX:512] := 0
vloadunpackhq
__m512i _mm512_mask_extloadunpackhi_epi64 (__m512i src, __mmask8 k, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)

Synopsis

__m512i _mm512_mask_extloadunpackhi_epi64 (__m512i src, __mmask8 k, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhq zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_EPI64_NONE: RETURN MEM[addr + 8*offset] ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_EPI64_NONE: RETURN 8 ESAC } dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false upSize := UPCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 7 IF k[j] IF foundNext64BytesBoundary == false IF (addr + (loadOffset + 1)*upSize) % 64 == 0 foundNext64BytesBoundary := true FI ELSE i := j*64 dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) FI loadOffset := loadOffset + 1 FI ENDFOR dst[MAX:512] := 0
vloadunpackhpd
__m512d _mm512_extloadunpackhi_pd (__m512d src, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)

Synopsis

__m512d _mm512_extloadunpackhi_pd (__m512d src, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhpd zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed double-precision (64-bit) floating-point values in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_PD_NONE: RETURN MEM[addr + 8*offset] ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_PD_NONE: RETURN 8 ESAC } dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false upSize := UPCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 7 IF foundNext64BytesBoundary == false IF (addr + (loadOffset + 1)*upSize) % 64 == 0 foundNext64BytesBoundary := true FI ELSE i := j*64 dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) FI loadOffset := loadOffset + 1 ENDFOR dst[MAX:512] := 0
vloadunpackhpd
__m512d _mm512_mask_extloadunpackhi_pd (__m512d src, __mmask8 k, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)

Synopsis

__m512d _mm512_mask_extloadunpackhi_pd (__m512d src, __mmask8 k, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhpd zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed double-precision (64-bit) floating-point values in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_PD_NONE: RETURN MEM[addr + 8*offset] ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_PD_NONE: RETURN 8 ESAC } dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false upSize := UPCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 7 IF k[j] IF foundNext64BytesBoundary == false IF (addr + (loadOffset + 1)*upSize) % 64 == 0 foundNext64BytesBoundary := true FI ELSE i := j*64 dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) FI loadOffset := loadOffset + 1 FI ENDFOR dst[MAX:512] := 0
vloadunpackhps
__m512 _mm512_extloadunpackhi_ps (__m512 src, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)

Synopsis

__m512 _mm512_extloadunpackhi_ps (__m512 src, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhps zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_PS_NONE: RETURN MEM[addr + 4*offset] _MM_UPCONV_PS_FLOAT16: RETURN Float16ToFloat32(MEM[addr + 4*offset]) _MM_UPCONV_PS_UINT8: RETURN UInt8ToFloat32(MEM[addr + offset]) _MM_UPCONV_PS_SINT8: RETURN SInt8ToFloat32(MEM[addr + offset]) _MM_UPCONV_PS_UINT16: RETURN UInt16ToFloat32(MEM[addr + 2*offset]) _MM_UPCONV_PS_SINT16: RETURN SInt16ToFloat32(MEM[addr + 2*offset]) ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_PS_NONE: RETURN 4 _MM_UPCONV_PS_FLOAT16: RETURN 2 _MM_UPCONV_PS_UINT8: RETURN 1 _MM_UPCONV_PS_SINT8: RETURN 1 _MM_UPCONV_PS_UINT16: RETURN 2 _MM_UPCONV_PS_SINT16: RETURN 2 ESAC } dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false upSize := UPCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 15 IF foundNext64BytesBoundary == false IF ((addr + (loadOffset + 1)*upSize) % 64) == 0 foundNext64BytesBoundary := true FI ELSE i := j*32 dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) FI loadOffset := loadOffset + 1 ENDFOR dst[MAX:512] := 0
vloadunpackhps
__m512 _mm512_mask_extloadunpackhi_ps (__m512 src, __mmask16 k, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)

Synopsis

__m512 _mm512_mask_extloadunpackhi_ps (__m512 src, __mmask16 k, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhps zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_PS_NONE: RETURN MEM[addr + 4*offset] _MM_UPCONV_PS_FLOAT16: RETURN Float16ToFloat32(MEM[addr + 4*offset]) _MM_UPCONV_PS_UINT8: RETURN UInt8ToFloat32(MEM[addr + offset]) _MM_UPCONV_PS_SINT8: RETURN SInt8ToFloat32(MEM[addr + offset]) _MM_UPCONV_PS_UINT16: RETURN UInt16ToFloat32(MEM[addr + 2*offset]) _MM_UPCONV_PS_SINT16: RETURN SInt16ToFloat32(MEM[addr + 2*offset]) ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_PS_NONE: RETURN 4 _MM_UPCONV_PS_FLOAT16: RETURN 2 _MM_UPCONV_PS_UINT8: RETURN 1 _MM_UPCONV_PS_SINT8: RETURN 1 _MM_UPCONV_PS_UINT16: RETURN 2 _MM_UPCONV_PS_SINT16: RETURN 2 ESAC } dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false upSize := UPCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 15 IF k[j] IF foundNext64BytesBoundary == false IF ((addr + (loadOffset + 1)*upSize) % 64) == 0 foundNext64BytesBoundary := true FI ELSE i := j*32 dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) FI loadOffset := loadOffset + 1 FI ENDFOR dst[MAX:512] := 0
vloadunpackld
__m512i _mm512_extloadunpacklo_epi32 (__m512i src, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)

Synopsis

__m512i _mm512_extloadunpacklo_epi32 (__m512i src, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackld zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_EPI32_NONE: RETURN MEM[addr + 4*offset] _MM_UPCONV_EPI32_UINT8: RETURN UInt8ToInt32(MEM[addr + offset]) _MM_UPCONV_EPI32_SINT8: RETURN SInt8ToInt32(MEM[addr + offset]) _MM_UPCONV_EPI32_UINT16: RETURN UInt16ToInt32(MEM[addr + 2*offset]) _MM_UPCONV_EPI32_SINT16: RETURN SInt16ToInt32(MEM[addr + 2*offset]) ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_EPI32_NONE: RETURN 4 _MM_UPCONV_EPI32_UINT8: RETURN 1 _MM_UPCONV_EPI32_SINT8: RETURN 1 _MM_UPCONV_EPI32_UINT16: RETURN 2 _MM_UPCONV_EPI32_SINT16: RETURN 2 ESAC } dst[511:0] := src[511:0] loadOffset := 0 upSize := UPCONVERTSIZE(conv) addr = mt FOR j := 0 to 15 i := j*32 dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) loadOffset := loadOffset + 1 IF (mt + loadOffset * upSize) % 64 == 0 break FI ENDFOR dst[MAX:512] := 0
vloadunpackld
__m512i _mm512_mask_extloadunpacklo_epi32 (__m512i src, __mmask16 k, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)

Synopsis

__m512i _mm512_mask_extloadunpacklo_epi32 (__m512i src, __mmask16 k, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackld zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_EPI32_NONE: RETURN MEM[addr + 4*offset] _MM_UPCONV_EPI32_UINT8: RETURN UInt8ToInt32(MEM[addr + offset]) _MM_UPCONV_EPI32_SINT8: RETURN SInt8ToInt32(MEM[addr + offset]) _MM_UPCONV_EPI32_UINT16: RETURN UInt16ToInt32(MEM[addr + 2*offset]) _MM_UPCONV_EPI32_SINT16: RETURN SInt16ToInt32(MEM[addr + 2*offset]) ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_EPI32_NONE: RETURN 4 _MM_UPCONV_EPI32_UINT8: RETURN 1 _MM_UPCONV_EPI32_SINT8: RETURN 1 _MM_UPCONV_EPI32_UINT16: RETURN 2 _MM_UPCONV_EPI32_SINT16: RETURN 2 ESAC } dst[511:0] := src[511:0] loadOffset := 0 upSize := UPCONVERTSIZE(conv) addr = mt FOR j := 0 to 15 IF k[j] i := j*32 dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) loadOffset := loadOffset + 1 IF (mt + loadOffset * upSize) % 64 == 0 break FI FI ENDFOR dst[MAX:512] := 0
vloadunpacklq
__m512i _mm512_extloadunpacklo_epi64 (__m512i src, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)

Synopsis

__m512i _mm512_extloadunpacklo_epi64 (__m512i src, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpacklq zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_EPI64_NONE: RETURN MEM[addr + 8*offset] ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_EPI64_NONE: RETURN 8 ESAC } dst[511:0] := src[511:0] loadOffset := 0 upSize := UPCONVERTSIZE(conv) addr = mt FOR j := 0 to 7 i := j*64 dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) loadOffset := loadOffset + 1 IF ((addr + loadOffset*upSize) % 64) == 0 BREAK FI ENDFOR dst[MAX:512] := 0
vloadunpacklq
__m512i _mm512_mask_extloadunpacklo_epi64 (__m512i src, __mmask8 k, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)

Synopsis

__m512i _mm512_mask_extloadunpacklo_epi64 (__m512i src, __mmask8 k, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpacklq zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_EPI64_NONE: RETURN MEM[addr + 8*offset] ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_EPI64_NONE: RETURN 8 ESAC } dst[511:0] := src[511:0] loadOffset := 0 upSize := UPCONVERTSIZE(conv) addr = mt FOR j := 0 to 7 IF k[j] i := j*64 dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) loadOffset := loadOffset + 1 IF ((addr + loadOffset*upSize) % 64) == 0 BREAK FI FI ENDFOR dst[MAX:512] := 0
vloadunpacklpd
__m512d _mm512_extloadunpacklo_pd (__m512d src, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)

Synopsis

__m512d _mm512_extloadunpacklo_pd (__m512d src, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpacklpd zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed double-precision (64-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_PD_NONE: RETURN MEM[addr + 8*offset] ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_PD_NONE: RETURN 8 ESAC } dst[511:0] := src[511:0] loadOffset := 0 upSize := UPCONVERTSIZE(conv) addr = mt FOR j := 0 to 7 i := j*64 dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) loadOffset := loadOffset + 1 IF (mt + loadOffset * upSize) % 64 == 0 break FI ENDFOR dst[MAX:512] := 0
vloadunpacklpd
__m512d _mm512_mask_extloadunpacklo_pd (__m512d src, __mmask8 k, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)

Synopsis

__m512d _mm512_mask_extloadunpacklo_pd (__m512d src, __mmask8 k, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpacklpd zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed double-precision (64-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_PD_NONE: RETURN MEM[addr + 8*offset] ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_PD_NONE: RETURN 8 ESAC } dst[511:0] := src[511:0] loadOffset := 0 upSize := UPCONVERTSIZE(conv) addr = mt FOR j := 0 to 7 IF k[j] i := j*64 dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) loadOffset := loadOffset + 1 IF (mt + loadOffset * upSize) % 64 == 0 break FI FI ENDFOR dst[MAX:512] := 0
vloadunpacklps
__m512 _mm512_extloadunpacklo_ps (__m512 src, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)

Synopsis

__m512 _mm512_extloadunpacklo_ps (__m512 src, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpacklps zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_PS_NONE: RETURN MEM[addr + 4*offset] _MM_UPCONV_PS_FLOAT16: RETURN Float16ToFloat32(MEM[addr + 4*offset]) _MM_UPCONV_PS_UINT8: RETURN UInt8ToFloat32(MEM[addr + offset]) _MM_UPCONV_PS_SINT8: RETURN SInt8ToFloat32(MEM[addr + offset]) _MM_UPCONV_PS_UINT16: RETURN UInt16ToFloat32(MEM[addr + 2*offset]) _MM_UPCONV_PS_SINT16: RETURN SInt16ToFloat32(MEM[addr + 2*offset]) ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_PS_NONE: RETURN 4 _MM_UPCONV_PS_FLOAT16: RETURN 2 _MM_UPCONV_PS_UINT8: RETURN 1 _MM_UPCONV_PS_SINT8: RETURN 1 _MM_UPCONV_PS_UINT16: RETURN 2 _MM_UPCONV_PS_SINT16: RETURN 2 ESAC } dst[511:0] := src[511:0] loadOffset := 0 upSize := UPCONVERTSIZE(conv) addr = mt FOR j := 0 to 15 i := j*32 dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) loadOffset := loadOffset + 1 IF (mt + loadOffset * upSize) % 64 == 0 break FI ENDFOR dst[MAX:512] := 0
vloadunpacklps
__m512 _mm512_mask_extloadunpacklo_ps (__m512 src, __mmask16 k, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)

Synopsis

__m512 _mm512_mask_extloadunpacklo_ps (__m512 src, __mmask16 k, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpacklps zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

UPCONVERT(address, offset, convertTo) { CASE conv OF _MM_UPCONV_PS_NONE: RETURN MEM[addr + 4*offset] _MM_UPCONV_PS_FLOAT16: RETURN Float16ToFloat32(MEM[addr + 4*offset]) _MM_UPCONV_PS_UINT8: RETURN UInt8ToFloat32(MEM[addr + offset]) _MM_UPCONV_PS_SINT8: RETURN SInt8ToFloat32(MEM[addr + offset]) _MM_UPCONV_PS_UINT16: RETURN UInt16ToFloat32(MEM[addr + 2*offset]) _MM_UPCONV_PS_SINT16: RETURN SInt16ToFloat32(MEM[addr + 2*offset]) ESAC } UPCONVERTSIZE(convertTo) { CASE conv OF _MM_UPCONV_PS_NONE: RETURN 4 _MM_UPCONV_PS_FLOAT16: RETURN 2 _MM_UPCONV_PS_UINT8: RETURN 1 _MM_UPCONV_PS_SINT8: RETURN 1 _MM_UPCONV_PS_UINT16: RETURN 2 _MM_UPCONV_PS_SINT16: RETURN 2 ESAC } dst[511:0] := src[511:0] loadOffset := 0 upSize := UPCONVERTSIZE(conv) addr = mt FOR j := 0 to 15 IF k[j] i := j*32 dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) loadOffset := loadOffset + 1 IF (mt + loadOffset * upSize) % 64 == 0 break FI FI ENDFOR dst[MAX:512] := 0
vpackstorehd
void _mm512_extpackstorehi_epi32 (void * mt, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)

Synopsis

void _mm512_extpackstorehi_epi32 (void * mt, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed 32-bit integer elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal.

Operation

DOWNCONVERT(element, convertTo) { CASE converTo OF _MM_UPCONV_EPI32_NONE: RETURN element[i+31:i] _MM_UPCONV_EPI32_UINT8: RETURN UInt32ToUInt8(element[i+31:i]) _MM_UPCONV_EPI32_SINT8: RETURN SInt32ToSInt8(element[i+31:i]) _MM_UPCONV_EPI32_UINT16: RETURN UInt32ToUInt16(element[i+31:i]) _MM_UPCONV_EPI32_SINT16: RETURN SInt32ToSInt16(element[i+31:i]) ESAC } DOWNCONVERTSIZE(convertTo) { CASE converTo OF _MM_UPCONV_EPI32_NONE: RETURN 4 _MM_UPCONV_EPI32_UINT8: RETURN 1 _MM_UPCONV_EPI32_SINT8: RETURN 1 _MM_UPCONV_EPI32_UINT16: RETURN 2 _MM_UPCONV_EPI32_SINT16: RETURN 2 ESAC } storeOffset := 0 foundNext64BytesBoundary := false downSize := DOWNCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 15 IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*32 tmp := DOWNCONVERT(v1[i+31:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 4: MEM[storeAddr] := tmp[31:0] 2: MEM[storeAddr] := tmp[15:0] 1: MEM[storeAddr] := tmp[7:0] ESAC FI storeOffset := storeOffset + 1 ENDFOR dst[MAX:512] := 0
vpackstorehd
void _mm512_mask_extpackstorehi_epi32 (void * mt, __mmask16 k, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)

Synopsis

void _mm512_mask_extpackstorehi_epi32 (void * mt, __mmask16 k, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed 32-bit integer elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

DOWNCONVERT(element, convertTo) { CASE converTo OF _MM_UPCONV_EPI32_NONE: RETURN element[i+31:i] _MM_UPCONV_EPI32_UINT8: RETURN UInt32ToUInt8(element[i+31:i]) _MM_UPCONV_EPI32_SINT8: RETURN SInt32ToSInt8(element[i+31:i]) _MM_UPCONV_EPI32_UINT16: RETURN UInt32ToUInt16(element[i+31:i]) _MM_UPCONV_EPI32_SINT16: RETURN SInt32ToSInt16(element[i+31:i]) ESAC } DOWNCONVERTSIZE(convertTo) { CASE converTo OF _MM_UPCONV_EPI32_NONE: RETURN 4 _MM_UPCONV_EPI32_UINT8: RETURN 1 _MM_UPCONV_EPI32_SINT8: RETURN 1 _MM_UPCONV_EPI32_UINT16: RETURN 2 _MM_UPCONV_EPI32_SINT16: RETURN 2 ESAC } storeOffset := 0 foundNext64BytesBoundary := false downSize := DOWNCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 15 IF k[j] IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*32 tmp := DOWNCONVERT(v1[i+31:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 4: MEM[storeAddr] := tmp[31:0] 2: MEM[storeAddr] := tmp[15:0] 1: MEM[storeAddr] := tmp[7:0] ESAC FI storeOffset := storeOffset + 1 FI ENDFOR dst[MAX:512] := 0
vpackstorehq
void _mm512_extpackstorehi_epi64 (void * mt, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)

Synopsis

void _mm512_extpackstorehi_epi64 (void * mt, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehq m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed 64-bit integer elements of v1 into a quadword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal.

Operation

DOWNCONVERT(element, convertTo) { CASE convertTo OF _MM_UPCONV_EPI64_NONE: RETURN element[i+63:i] ESAC } DOWNCONVERTSIZE(convertTo) { CASE convertTo OF _MM_UPCONV_EPI64_NONE: RETURN 8 ESAC } storeOffset := 0 foundNext64BytesBoundary := false downSize := DOWNCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 7 IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*64 tmp := DOWNCONVERT(v1[i+63:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 8: MEM[storeAddr] := tmp[63:0] ESAC FI storeOffset := storeOffset + 1 ENDFOR dst[MAX:512] := 0
vpackstorehq
void _mm512_mask_extpackstorehi_epi64 (void * mt, __mmask8 k, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)

Synopsis

void _mm512_mask_extpackstorehi_epi64 (void * mt, __mmask8 k, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehq m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed 64-bit integer elements of v1 into a quadword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

DOWNCONVERT(element, convertTo) { CASE convertTo OF _MM_UPCONV_EPI64_NONE: RETURN element[i+63:i] ESAC } DOWNCONVERTSIZE(convertTo) { CASE convertTo OF _MM_UPCONV_EPI64_NONE: RETURN 8 ESAC } storeOffset := 0 foundNext64BytesBoundary := false downSize := DOWNCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 7 IF k[j] IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*64 tmp := DOWNCONVERT(v1[i+63:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 8: MEM[storeAddr] := tmp[63:0] ESAC FI storeOffset := storeOffset + 1 FI ENDFOR dst[MAX:512] := 0
vpackstorehpd
void _mm512_extpackstorehi_pd (void * mt, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)

Synopsis

void _mm512_extpackstorehi_pd (void * mt, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehpd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal.

Operation

DOWNCONVERT(element, convertTo) { CASE convertTo OF _MM_UPCONV_PD_NONE: RETURN element[i+63:i] ESAC } DOWNCONVERTSIZE(convertTo) { CASE convertTo OF _MM_UPCONV_PD_NONE: RETURN 8 ESAC } storeOffset := 0 foundNext64BytesBoundary := false downSize := DOWNCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 7 IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*64 tmp := DOWNCONVERT(v1[i+63:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 8: MEM[storeAddr] := tmp[63:0] ESAC FI storeOffset := storeOffset + 1 ENDFOR dst[MAX:512] := 0
vpackstorehpd
void _mm512_mask_extpackstorehi_pd (void * mt, __mmask8 k, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)

Synopsis

void _mm512_mask_extpackstorehi_pd (void * mt, __mmask8 k, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehpd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

DOWNCONVERT(element, convertTo) { CASE convertTo OF _MM_UPCONV_PD_NONE: RETURN element[i+63:i] ESAC } DOWNCONVERTSIZE(convertTo) { CASE convertTo OF _MM_UPCONV_PD_NONE: RETURN 8 ESAC } storeOffset := 0 foundNext64BytesBoundary := false downSize := DOWNCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 7 IF k[j] IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*64 tmp := DOWNCONVERT(v1[i+63:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 8: MEM[storeAddr] := tmp[63:0] ESAC FI storeOffset := storeOffset + 1 FI ENDFOR dst[MAX:512] := 0
vpackstorehps
void _mm512_extpackstorehi_ps (void * mt, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)

Synopsis

void _mm512_extpackstorehi_ps (void * mt, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehps m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed single-precision (32-bit) floating-point elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal.

Operation

DOWNCONVERT(element, convertTo) { CASE converTo OF _MM_UPCONV_PS_NONE: RETURN element[i+31:i] _MM_UPCONV_PS_FLOAT16: RETURN Float32ToFloat16(element[i+31:i]) _MM_UPCONV_PS_UINT8: RETURN UInt32ToUInt8(element[i+31:i]) _MM_UPCONV_PS_SINT8: RETURN SInt32ToSInt8(element[i+31:i]) _MM_UPCONV_PS_UINT16: RETURN UInt32ToUInt16(element[i+31:i]) _MM_UPCONV_PS_SINT16: RETURN SInt32ToSInt16(element[i+31:i]) ESAC } DOWNCONVERTSIZE(convertTo) { CASE converTo OF _MM_UPCONV_PS_NONE: RETURN 4 _MM_UPCONV_PS_FLOAT16: RETURN 2 _MM_UPCONV_PS_UINT8: RETURN 1 _MM_UPCONV_PS_SINT8: RETURN 1 _MM_UPCONV_PS_UINT16: RETURN 2 _MM_UPCONV_PS_SINT16: RETURN 2 ESAC } storeOffset := 0 foundNext64BytesBoundary := false downSize := DOWNCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 15 IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*32 tmp := DOWNCONVERT(v1[i+31:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 4: MEM[storeAddr] := tmp[31:0] 2: MEM[storeAddr] := tmp[15:0] 1: MEM[storeAddr] := tmp[7:0] ESAC FI storeOffset := storeOffset + 1 ENDFOR dst[MAX:512] := 0
vpackstorehps
void _mm512_mask_extpackstorehi_ps (void * mt, __mmask16 k, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)

Synopsis

void _mm512_mask_extpackstorehi_ps (void * mt, __mmask16 k, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehps m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed single-precision (32-bit) floating-point elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

DOWNCONVERT(element, convertTo) { CASE converTo OF _MM_UPCONV_PS_NONE: RETURN element[i+31:i] _MM_UPCONV_PS_FLOAT16: RETURN Float32ToFloat16(element[i+31:i]) _MM_UPCONV_PS_UINT8: RETURN UInt32ToUInt8(element[i+31:i]) _MM_UPCONV_PS_SINT8: RETURN SInt32ToSInt8(element[i+31:i]) _MM_UPCONV_PS_UINT16: RETURN UInt32ToUInt16(element[i+31:i]) _MM_UPCONV_PS_SINT16: RETURN SInt32ToSInt16(element[i+31:i]) ESAC } DOWNCONVERTSIZE(convertTo) { CASE converTo OF _MM_UPCONV_PS_NONE: RETURN 4 _MM_UPCONV_PS_FLOAT16: RETURN 2 _MM_UPCONV_PS_UINT8: RETURN 1 _MM_UPCONV_PS_SINT8: RETURN 1 _MM_UPCONV_PS_UINT16: RETURN 2 _MM_UPCONV_PS_SINT16: RETURN 2 ESAC } storeOffset := 0 foundNext64BytesBoundary := false downSize := DOWNCONVERTSIZE(conv) addr = mt-64 FOR j := 0 to 15 IF k[j] IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*32 tmp := DOWNCONVERT(v1[i+31:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 4: MEM[storeAddr] := tmp[31:0] 2: MEM[storeAddr] := tmp[15:0] 1: MEM[storeAddr] := tmp[7:0] ESAC FI storeOffset := storeOffset + 1 FI ENDFOR dst[MAX:512] := 0
vpackstoreld
void _mm512_extpackstorelo_epi32 (void * mt, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)

Synopsis

void _mm512_extpackstorelo_epi32 (void * mt, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstoreld m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed 32-bit integer elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal.

Operation

DOWNCONVERT(element, convertTo) { CASE converTo OF _MM_UPCONV_EPI32_NONE: RETURN element[i+31:i] _MM_UPCONV_EPI32_UINT8: RETURN UInt32ToUInt8(element[i+31:i]) _MM_UPCONV_EPI32_SINT8: RETURN SInt32ToSInt8(element[i+31:i]) _MM_UPCONV_EPI32_UINT16: RETURN UInt32ToUInt16(element[i+31:i]) _MM_UPCONV_EPI32_SINT16: RETURN SInt32ToSInt16(element[i+31:i]) ESAC } DOWNCONVERTSIZE(convertTo) { CASE converTo OF _MM_UPCONV_EPI32_NONE: RETURN 4 _MM_UPCONV_EPI32_UINT8: RETURN 1 _MM_UPCONV_EPI32_SINT8: RETURN 1 _MM_UPCONV_EPI32_UINT16: RETURN 2 _MM_UPCONV_EPI32_SINT16: RETURN 2 ESAC } storeOffset := 0 downSize := DOWNCONVERTSIZE(conv) addr = mt FOR j := 0 to 15 i := j*32 tmp := DOWNCONVERT(v1[i+31:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 4: MEM[storeAddr] := tmp[31:0] 2: MEM[storeAddr] := tmp[15:0] 1: MEM[storeAddr] := tmp[7:0] ESAC storeOffset := storeOffset + 1 IF ((addr + storeOffset * downSize) % 64) == 0 BREAK FI ENDFOR dst[MAX:512] := 0
vpackstoreld
void _mm512_mask_extpackstorelo_epi32 (void * mt, __mmask16 k, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)

Synopsis

void _mm512_mask_extpackstorelo_epi32 (void * mt, __mmask16 k, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstoreld m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed 32-bit integer elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal. Elements are written to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

DOWNCONVERT(element, convertTo) { CASE convertTo OF _MM_UPCONV_EPI32_NONE: RETURN element[i+31:i] _MM_UPCONV_EPI32_UINT8: RETURN UInt32ToUInt8(element[i+31:i]) _MM_UPCONV_EPI32_SINT8: RETURN SInt32ToSInt8(element[i+31:i]) _MM_UPCONV_EPI32_UINT16: RETURN UInt32ToUInt16(element[i+31:i]) _MM_UPCONV_EPI32_SINT16: RETURN SInt32ToSInt16(element[i+31:i]) ESAC } DOWNCONVERTSIZE(convertTo) { CASE convertTo OF _MM_UPCONV_EPI32_NONE: RETURN 4 _MM_UPCONV_EPI32_UINT8: RETURN 1 _MM_UPCONV_EPI32_SINT8: RETURN 1 _MM_UPCONV_EPI32_UINT16: RETURN 2 _MM_UPCONV_EPI32_SINT16: RETURN 2 ESAC } storeOffset := 0 downSize := DOWNCONVERTSIZE(conv) addr = mt FOR j := 0 to 15 IF k[j] i := j*32 tmp := DOWNCONVERT(v1[i+31:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 4: MEM[storeAddr] := tmp[31:0] 2: MEM[storeAddr] := tmp[15:0] 1: MEM[storeAddr] := tmp[7:0] ESAC storeOffset := storeOffset + 1 IF ((addr + storeOffset * downSize) % 64) == 0 BREAK FI FI ENDFOR dst[MAX:512] := 0
vpackstorelq
void _mm512_extpackstorelo_epi64 (void * mt, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)

Synopsis

void _mm512_extpackstorelo_epi64 (void * mt, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorelq m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed 64-bit integer elements of v1 into a quadword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal.

Operation

DOWNCONVERT(element, convertTo) { CASE convertTo OF _MM_UPCONV_EPI64_NONE: RETURN element[i+63:i] ESAC } DOWNCONVERTSIZE(convertTo) { CASE convertTo OF _MM_UPCONV_EPI64_NONE: RETURN 8 ESAC } storeOffset := 0 downSize := DOWNCONVERTSIZE(conv) addr = mt FOR j := 0 to 7 i := j*64 tmp := DOWNCONVERT(v1[i+63:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 8: MEM[storeAddr] := tmp[63:0] ESAC storeOffset := storeOffset + 1 IF ((addr + storeOffset * downSize) % 64) == 0 BREAK FI ENDFOR dst[MAX:512] := 0
vpackstorelq
void _mm512_mask_extpackstorelo_epi64 (void * mt, __mmask8 k, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)

Synopsis

void _mm512_mask_extpackstorelo_epi64 (void * mt, __mmask8 k, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorelq m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed 64-bit integer elements of v1 into a quadword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

DOWNCONVERT(element, convertTo) { CASE convertTo OF _MM_UPCONV_EPI64_NONE: RETURN element[i+63:i] ESAC } DOWNCONVERTSIZE(convertTo) { CASE convertTo OF _MM_UPCONV_EPI64_NONE: RETURN 8 ESAC } storeOffset := 0 downSize := DOWNCONVERTSIZE(conv) addr = mt FOR j := 0 to 7 IF k[j] i := j*64 tmp := DOWNCONVERT(v1[i+63:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 8: MEM[storeAddr] := tmp[63:0] ESAC storeOffset := storeOffset + 1 IF ((addr + storeOffset * downSize) % 64) == 0 BREAK FI FI ENDFOR dst[MAX:512] := 0
vpackstorelpd
void _mm512_extpackstorelo_pd (void * mt, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)

Synopsis

void _mm512_extpackstorelo_pd (void * mt, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorelpd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal.

Operation

DOWNCONVERT(element, convertTo) { CASE convertTo OF _MM_UPCONV_PD_NONE: RETURN element[i+63:i] ESAC } DOWNCONVERTSIZE(convertTo) { CASE convertTo OF _MM_UPCONV_PD_NONE: RETURN 8 ESAC } storeOffset := 0 downSize := DOWNCONVERTSIZE(conv) addr = mt FOR j := 0 to 7 i := j*64 tmp := DOWNCONVERT(v1[i+63:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 8: MEM[storeAddr] := tmp[63:0] ESAC storeOffset := storeOffset + 1 IF ((addr + storeOffset * downSize) % 64) == 0 BREAK FI ENDFOR dst[MAX:512] := 0
vpackstorelpd
void _mm512_mask_extpackstorelo_pd (void * mt, __mmask8 k, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)

Synopsis

void _mm512_mask_extpackstorelo_pd (void * mt, __mmask8 k, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorelpd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

DOWNCONVERT(element, convertTo) { CASE convertTo OF _MM_UPCONV_PD_NONE: RETURN element[i+63:i] ESAC } DOWNCONVERTSIZE(convertTo) { CASE convertTo OF _MM_UPCONV_PD_NONE: RETURN 8 ESAC } storeOffset := 0 downSize := DOWNCONVERTSIZE(conv) addr = mt FOR j := 0 to 7 IF k[j] i := j*64 tmp := DOWNCONVERT(v1[i+63:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 8: MEM[storeAddr] := tmp[63:0] ESAC storeOffset := storeOffset + 1 IF ((addr + storeOffset * downSize) % 64) == 0 BREAK FI FI ENDFOR dst[MAX:512] := 0
vpackstorelps
void _mm512_extpackstorelo_ps (void * mt, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)

Synopsis

void _mm512_extpackstorelo_ps (void * mt, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorelps m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed single-precision (32-bit) floating-point elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal.

Operation

DOWNCONVERT(element, convertTo) { CASE convertTo OF _MM_UPCONV_PS_NONE: RETURN element[i+31:i] _MM_UPCONV_PS_FLOAT16: RETURN Float32ToFloat16(element[i+31:i]) _MM_UPCONV_PS_UINT8: RETURN UInt32ToUInt8(element[i+31:i]) _MM_UPCONV_PS_SINT8: RETURN SInt32ToSInt8(element[i+31:i]) _MM_UPCONV_PS_UINT16: RETURN UInt32ToUInt16(element[i+31:i]) _MM_UPCONV_PS_SINT16: RETURN SInt32ToSInt16(element[i+31:i]) ESAC } DOWNCONVERTSIZE(convertTo) { CASE convertTo OF _MM_UPCONV_PS_NONE: RETURN 4 _MM_UPCONV_PS_FLOAT16: RETURN 2 _MM_UPCONV_PS_UINT8: RETURN 1 _MM_UPCONV_PS_SINT8: RETURN 1 _MM_UPCONV_PS_UINT16: RETURN 2 _MM_UPCONV_PS_SINT16: RETURN 2 ESAC } storeOffset := 0 downSize := DOWNCONVERTSIZE(conv) addr = mt FOR j := 0 to 15 i := j*32 tmp := DOWNCONVERT(v1[i+31:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 4: MEM[storeAddr] := tmp[31:0] 2: MEM[storeAddr] := tmp[15:0] 1: MEM[storeAddr] := tmp[7:0] ESAC storeOffset := storeOffset + 1 IF ((addr + storeOffset * downSize) % 64) == 0 BREAK FI ENDFOR dst[MAX:512] := 0
vpackstorelps
void _mm512_mask_extpackstorelo_ps (void * mt, __mmask16 k, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)

Synopsis

void _mm512_mask_extpackstorelo_ps (void * mt, __mmask16 k, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorelps m512 {k}, zmm
CPUID Flags: KNCNI

Description

Down-converts and stores packed single-precision (32-bit) floating-point elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

DOWNCONVERT(element, convertTo) { CASE convertTo OF _MM_UPCONV_PS_NONE: RETURN element[i+31:i] _MM_UPCONV_PS_FLOAT16: RETURN Float32ToFloat16(element[i+31:i]) _MM_UPCONV_PS_UINT8: RETURN UInt32ToUInt8(element[i+31:i]) _MM_UPCONV_PS_SINT8: RETURN SInt32ToSInt8(element[i+31:i]) _MM_UPCONV_PS_UINT16: RETURN UInt32ToUInt16(element[i+31:i]) _MM_UPCONV_PS_SINT16: RETURN SInt32ToSInt16(element[i+31:i]) ESAC } DOWNCONVERTSIZE(convertTo) { CASE convertTo OF _MM_UPCONV_PS_NONE: RETURN 4 _MM_UPCONV_PS_FLOAT16: RETURN 2 _MM_UPCONV_PS_UINT8: RETURN 1 _MM_UPCONV_PS_SINT8: RETURN 1 _MM_UPCONV_PS_UINT16: RETURN 2 _MM_UPCONV_PS_SINT16: RETURN 2 ESAC } storeOffset := 0 downSize := DOWNCONVERTSIZE(conv) addr = mt FOR j := 0 to 15 IF k[j] i := j*32 tmp := DOWNCONVERT(v1[i+31:i], conv) storeAddr := addr + storeOffset * downSize CASE downSize OF 4: MEM[storeAddr] := tmp[31:0] 2: MEM[storeAddr] := tmp[15:0] 1: MEM[storeAddr] := tmp[7:0] ESAC storeOffset := storeOffset + 1 IF ((addr + storeOffset * downSize) % 64) == 0 BREAK FI FI ENDFOR dst[MAX:512] := 0
pextrw
int _mm_extract_epi16 (__m128i a, int imm8)

Synopsis

int _mm_extract_epi16 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: pextrw r32, xmm, imm
CPUID Flags: SSE2

Description

Extract a 16-bit integer from a, selected with imm8, and store the result in the lower element of dst.

Operation

dst[15:0] := (a[127:0] >> (imm8[2:0] * 16))[15:0] dst[31:16] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
...
__int16 _mm256_extract_epi16 (__m256i a, const int index)

Synopsis

__int16 _mm256_extract_epi16 (__m256i a, const int index)
#include "immintrin.h"
CPUID Flags: AVX

Description

Extract a 16-bit integer from a, selected with index, and store the result in dst.

Operation

dst[15:0] := (a[255:0] >> (index * 16))[15:0]
pextrd
int _mm_extract_epi32 (__m128i a, const int imm8)

Synopsis

int _mm_extract_epi32 (__m128i a, const int imm8)
#include "smmintrin.h"
Instruction: pextrd r32, xmm, imm
CPUID Flags: SSE4.1

Description

Extract a 32-bit integer from a, selected with imm8, and store the result in dst.

Operation

dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0]

Performance

ArchitectureLatencyThroughput
Haswell2-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
...
__int32 _mm256_extract_epi32 (__m256i a, const int index)

Synopsis

__int32 _mm256_extract_epi32 (__m256i a, const int index)
#include "immintrin.h"
CPUID Flags: AVX

Description

Extract a 32-bit integer from a, selected with index, and store the result in dst.

Operation

dst[31:0] := (a[255:0] >> (index * 32))[31:0]
pextrq
__int64 _mm_extract_epi64 (__m128i a, const int imm8)

Synopsis

__int64 _mm_extract_epi64 (__m128i a, const int imm8)
#include "smmintrin.h"
Instruction: pextrq r64, xmm, imm
CPUID Flags: SSE4.1

Description

Extract a 64-bit integer from a, selected with imm8, and store the result in dst.

Operation

dst[63:0] := (a[127:0] >> (imm8[0] * 64))[63:0]

Performance

ArchitectureLatencyThroughput
Haswell2-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
...
__int64 _mm256_extract_epi64 (__m256i a, const int index)

Synopsis

__int64 _mm256_extract_epi64 (__m256i a, const int index)
#include "immintrin.h"
CPUID Flags: AVX

Description

Extract a 64-bit integer from a, selected with index, and store the result in dst.

Operation

dst[63:0] := (a[255:0] >> (index * 64))[63:0]
pextrb
int _mm_extract_epi8 (__m128i a, const int imm8)

Synopsis

int _mm_extract_epi8 (__m128i a, const int imm8)
#include "smmintrin.h"
Instruction: pextrb r32, xmm, imm
CPUID Flags: SSE4.1

Description

Extract an 8-bit integer from a, selected with imm8, and store the result in the lower element of dst.

Operation

dst[7:0] := (a[127:0] >> (imm8[3:0] * 8))[7:0] dst[31:8] := 0

Performance

ArchitectureLatencyThroughput
Haswell2-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
...
__int8 _mm256_extract_epi8 (__m256i a, const int index)

Synopsis

__int8 _mm256_extract_epi8 (__m256i a, const int index)
#include "immintrin.h"
CPUID Flags: AVX

Description

Extract an 8-bit integer from a, selected with index, and store the result in dst.

Operation

dst[7:0] := (a[255:0] >> (index * 8))[7:0]
pextrw
int _mm_extract_pi16 (__m64 a, int imm8)

Synopsis

int _mm_extract_pi16 (__m64 a, int imm8)
#include "xmmintrin.h"
Instruction: pextrw r32, mm, imm
CPUID Flags: SSE

Description

Extract a 16-bit integer from a, selected with imm8, and store the result in the lower element of dst.

Operation

dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0] dst[31:16] := 0
extractps
int _mm_extract_ps (__m128 a, const int imm8)

Synopsis

int _mm_extract_ps (__m128 a, const int imm8)
#include "smmintrin.h"
Instruction: extractps r32, xmm, imm
CPUID Flags: SSE4.1

Description

Extract a single-precision (32-bit) floating-point element from a, selected with imm8, and store the result in dst.

Operation

dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0]

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere2-
Nehalem2-
vextractf128
__m128d _mm256_extractf128_pd (__m256d a, const int imm8)

Synopsis

__m128d _mm256_extractf128_pd (__m256d a, const int imm8)
#include "immintrin.h"
Instruction: vextractf128 xmm, ymm, imm
CPUID Flags: AVX

Description

Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] ESAC dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vextractf128
__m128 _mm256_extractf128_ps (__m256 a, const int imm8)

Synopsis

__m128 _mm256_extractf128_ps (__m256 a, const int imm8)
#include "immintrin.h"
Instruction: vextractf128 xmm, ymm, imm
CPUID Flags: AVX

Description

Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] ESAC dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vextractf128
__m128i _mm256_extractf128_si256 (__m256i a, const int imm8)

Synopsis

__m128i _mm256_extractf128_si256 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vextractf128 xmm, ymm, imm
CPUID Flags: AVX

Description

Extract 128 bits (composed of integer data) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] ESAC dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vextractf32x4
__m128 _mm256_extractf32x4_ps (__m256 a, int imm8)

Synopsis

__m128 _mm256_extractf32x4_ps (__m256 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x4
CPUID Flags: AVX512VL + AVX512F

Description

Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] ESAC dst[MAX:128] := 0
vextractf32x4
__m128 _mm256_mask_extractf32x4_ps (__m128 src, __mmask8 k, __m256 a, int imm8)

Synopsis

__m128 _mm256_mask_extractf32x4_ps (__m128 src, __mmask8 k, __m256 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x4
CPUID Flags: AVX512VL + AVX512F

Description

Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] ESAC FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vextractf32x4
__m128 _mm256_maskz_extractf32x4_ps (__mmask8 k, __m256 a, int imm8)

Synopsis

__m128 _mm256_maskz_extractf32x4_ps (__mmask8 k, __m256 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x4
CPUID Flags: AVX512VL + AVX512F

Description

Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] ESAC FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vextractf32x4
__m128 _mm512_extractf32x4_ps (__m512 a, int imm8)

Synopsis

__m128 _mm512_extractf32x4_ps (__m512 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x4 xmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] 2: dst[127:0] := a[383:256] 3: dst[127:0] := a[511:384] ESAC dst[MAX:128] := 0
vextractf32x4
__m128 _mm512_mask_extractf32x4_ps (__m128 src, __mmask8 k, __m512 a, int imm8)

Synopsis

__m128 _mm512_mask_extractf32x4_ps (__m128 src, __mmask8 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x4 xmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] 2: tmp[127:0] := a[383:256] 3: tmp[127:0] := a[511:384] ESAC FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vextractf32x4
__m128 _mm512_maskz_extractf32x4_ps (__mmask8 k, __m512 a, int imm8)

Synopsis

__m128 _mm512_maskz_extractf32x4_ps (__mmask8 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x4 xmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] 2: tmp[127:0] := a[383:256] 3: tmp[127:0] := a[511:384] ESAC FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vextractf32x8
__m256 _mm512_extractf32x8_ps (__m512 a, int imm8)

Synopsis

__m256 _mm512_extractf32x8_ps (__m512 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x8
CPUID Flags: AVX512DQ

Description

Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[255:0] := a[255:0] 1: dst[255:0] := a[511:256] ESAC dst[MAX:256] := 0
vextractf32x8
__m256 _mm512_mask_extractf32x8_ps (__m256 src, __mmask8 k, __m512 a, int imm8)

Synopsis

__m256 _mm512_mask_extractf32x8_ps (__m256 src, __mmask8 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x8
CPUID Flags: AVX512DQ

Description

Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[255:0] := a[255:0] 1: tmp[255:0] := a[511:256] ESAC FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vextractf32x8
__m256 _mm512_maskz_extractf32x8_ps (__mmask8 k, __m512 a, int imm8)

Synopsis

__m256 _mm512_maskz_extractf32x8_ps (__mmask8 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x8
CPUID Flags: AVX512DQ

Description

Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[255:0] := a[255:0] 1: tmp[255:0] := a[511:256] ESAC FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vextractf64x2
__m128d _mm256_extractf64x2_pd (__m256d a, int imm8)

Synopsis

__m128d _mm256_extractf64x2_pd (__m256d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] ESAC dst[MAX:128] := 0
vextractf64x2
__m128d _mm256_mask_extractf64x2_pd (__m128d src, __mmask8 k, __m256d a, int imm8)

Synopsis

__m128d _mm256_mask_extractf64x2_pd (__m128d src, __mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] ESAC FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vextractf64x2
__m128d _mm256_maskz_extractf64x2_pd (__mmask8 k, __m256d a, int imm8)

Synopsis

__m128d _mm256_maskz_extractf64x2_pd (__mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] ESAC FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vextractf64x2
__m128d _mm512_extractf64x2_pd (__m512d a, int imm8)

Synopsis

__m128d _mm512_extractf64x2_pd (__m512d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x2
CPUID Flags: AVX512DQ

Description

Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] 2: dst[127:0] := a[383:256] 3: dst[127:0] := a[511:384] ESAC dst[MAX:128] := 0
vextractf64x2
__m128d _mm512_mask_extractf64x2_pd (__m128d src, __mmask8 k, __m512d a, int imm8)

Synopsis

__m128d _mm512_mask_extractf64x2_pd (__m128d src, __mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x2
CPUID Flags: AVX512DQ

Description

Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] 2: tmp[127:0] := a[383:256] 3: tmp[127:0] := a[511:384] ESAC FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vextractf64x2
__m128d _mm512_maskz_extractf64x2_pd (__mmask8 k, __m512d a, int imm8)

Synopsis

__m128d _mm512_maskz_extractf64x2_pd (__mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x2
CPUID Flags: AVX512DQ

Description

Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] 2: tmp[127:0] := a[383:256] 3: tmp[127:0] := a[511:384] ESAC FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vextractf64x4
__m256d _mm512_extractf64x4_pd (__m512d a, int imm8)

Synopsis

__m256d _mm512_extractf64x4_pd (__m512d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x4 ymm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[255:0] := a[255:0] 1: dst[255:0] := a[511:256] ESAC dst[MAX:256] := 0
vextractf64x4
__m256d _mm512_mask_extractf64x4_pd (__m256d src, __mmask8 k, __m512d a, int imm8)

Synopsis

__m256d _mm512_mask_extractf64x4_pd (__m256d src, __mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x4 ymm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[255:0] := a[255:0] 1: tmp[255:0] := a[511:256] ESAC FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vextractf64x4
__m256d _mm512_maskz_extractf64x4_pd (__mmask8 k, __m512d a, int imm8)

Synopsis

__m256d _mm512_maskz_extractf64x4_pd (__mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x4 ymm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[255:0] := a[255:0] 1: tmp[255:0] := a[511:256] ESAC FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vextracti128
__m128i _mm256_extracti128_si256 (__m256i a, const int imm8)

Synopsis

__m128i _mm256_extracti128_si256 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vextracti128 xmm, ymm, imm
CPUID Flags: AVX2

Description

Extract 128 bits (composed of integer data) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] ESAC dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vextracti32x4
__m128i _mm256_extracti32x4_epi32 (__m256i a, int imm8)

Synopsis

__m128i _mm256_extracti32x4_epi32 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x4
CPUID Flags: AVX512VL + AVX512F

Description

Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] ESAC dst[MAX:128] := 0
vextracti32x4
__m128i _mm256_mask_extracti32x4_epi32 (__m128i src, __mmask8 k, __m256i a, int imm8)

Synopsis

__m128i _mm256_mask_extracti32x4_epi32 (__m128i src, __mmask8 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x4
CPUID Flags: AVX512VL + AVX512F

Description

Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] ESAC FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vextracti32x4
__m128i _mm256_maskz_extracti32x4_epi32 (__mmask8 k, __m256i a, int imm8)

Synopsis

__m128i _mm256_maskz_extracti32x4_epi32 (__mmask8 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x4
CPUID Flags: AVX512VL + AVX512F

Description

Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] ESAC FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vextracti32x4
__m128i _mm512_extracti32x4_epi32 (__m512i a, int imm8)

Synopsis

__m128i _mm512_extracti32x4_epi32 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x4 xmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] 2: dst[127:0] := a[383:256] 3: dst[127:0] := a[511:384] ESAC dst[MAX:128] := 0
vextracti32x4
__m128i _mm512_mask_extracti32x4_epi32 (__m128i src, __mmask8 k, __m512i a, int imm8)

Synopsis

__m128i _mm512_mask_extracti32x4_epi32 (__m128i src, __mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x4 xmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] 2: tmp[127:0] := a[383:256] 3: tmp[127:0] := a[511:384] ESAC FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vextracti32x4
__m128i _mm512_maskz_extracti32x4_epi32 (__mmask8 k, __m512i a, int imm8)

Synopsis

__m128i _mm512_maskz_extracti32x4_epi32 (__mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x4 xmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] 2: tmp[127:0] := a[383:256] 3: tmp[127:0] := a[511:384] ESAC FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vextracti32x8
__m256i _mm512_extracti32x8_epi32 (__m512i a, int imm8)

Synopsis

__m256i _mm512_extracti32x8_epi32 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x8
CPUID Flags: AVX512DQ

Description

Extract 256 bits (composed of 8 packed 32-bit integers) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[255:0] := a[255:0] 1: dst[255:0] := a[511:256] ESAC dst[MAX:256] := 0
vextracti32x8
__m256i _mm512_mask_extracti32x8_epi32 (__m256i src, __mmask8 k, __m512i a, int imm8)

Synopsis

__m256i _mm512_mask_extracti32x8_epi32 (__m256i src, __mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x8
CPUID Flags: AVX512DQ

Description

Extract 256 bits (composed of 8 packed 32-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[255:0] := a[255:0] 1: tmp[255:0] := a[511:256] ESAC FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vextracti32x8
__m256i _mm512_maskz_extracti32x8_epi32 (__mmask8 k, __m512i a, int imm8)

Synopsis

__m256i _mm512_maskz_extracti32x8_epi32 (__mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x8
CPUID Flags: AVX512DQ

Description

Extract 256 bits (composed of 8 packed 32-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[255:0] := a[255:0] 1: tmp[255:0] := a[511:256] ESAC FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vextracti64x2
__m128i _mm256_extracti64x2_epi64 (__m256i a, int imm8)

Synopsis

__m128i _mm256_extracti64x2_epi64 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract 128 bits (composed of 2 packed 64-bit integers) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] ESAC dst[MAX:128] := 0
vextracti64x2
__m128i _mm256_mask_extracti64x2_epi64 (__m128i src, __mmask8 k, __m256i a, int imm8)

Synopsis

__m128i _mm256_mask_extracti64x2_epi64 (__m128i src, __mmask8 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract 128 bits (composed of 2 packed 64-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] ESAC FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vextracti64x2
__m128i _mm256_maskz_extracti64x2_epi64 (__mmask8 k, __m256i a, int imm8)

Synopsis

__m128i _mm256_maskz_extracti64x2_epi64 (__mmask8 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract 128 bits (composed of 2 packed 64-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] ESAC FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vextracti64x2
__m128i _mm512_extracti64x2_epi64 (__m512i a, int imm8)

Synopsis

__m128i _mm512_extracti64x2_epi64 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x2
CPUID Flags: AVX512DQ

Description

Extract 128 bits (composed of 2 packed 64-bit integers) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] 2: dst[127:0] := a[383:256] 3: dst[127:0] := a[511:384] ESAC dst[MAX:128] := 0
vextracti64x2
__m128i _mm512_mask_extracti64x2_epi64 (__m128i src, __mmask8 k, __m512i a, int imm8)

Synopsis

__m128i _mm512_mask_extracti64x2_epi64 (__m128i src, __mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x2
CPUID Flags: AVX512DQ

Description

Extract 128 bits (composed of 2 packed 64-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] 2: tmp[127:0] := a[383:256] 3: tmp[127:0] := a[511:384] ESAC FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vextracti64x2
__m128i _mm512_maskz_extracti64x2_epi64 (__mmask8 k, __m512i a, int imm8)

Synopsis

__m128i _mm512_maskz_extracti64x2_epi64 (__mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x2
CPUID Flags: AVX512DQ

Description

Extract 128 bits (composed of 2 packed 64-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] 2: tmp[127:0] := a[383:256] 3: tmp[127:0] := a[511:384] ESAC FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vextracti64x4
__m256i _mm512_extracti64x4_epi64 (__m512i a, int imm8)

Synopsis

__m256i _mm512_extracti64x4_epi64 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x4 ymm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with imm8, and store the result in dst.

Operation

CASE imm8[7:0] of 0: dst[255:0] := a[255:0] 1: dst[255:0] := a[511:256] ESAC dst[MAX:256] := 0
vextracti64x4
__m256i _mm512_mask_extracti64x4_epi64 (__m256i src, __mmask8 k, __m512i a, int imm8)

Synopsis

__m256i _mm512_mask_extracti64x4_epi64 (__m256i src, __mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x4 ymm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[255:0] := a[255:0] 1: tmp[255:0] := a[511:256] ESAC FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vextracti64x4
__m256i _mm512_maskz_extracti64x4_epi64 (__mmask8 k, __m512i a, int imm8)

Synopsis

__m256i _mm512_maskz_extracti64x4_epi64 (__mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x4 ymm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

CASE imm8[7:0] of 0: tmp[255:0] := a[255:0] 1: tmp[255:0] := a[511:256] ESAC FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovdqa32
void _mm512_extstore_epi32 (void * mt, __m512i v, _MM_DOWNCONV_EPI32_ENUM conv, int hint)

Synopsis

void _mm512_extstore_epi32 (void * mt, __m512i v, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovdqa32 m512 {k}, zmm
CPUID Flags: KNCNI

Description

Downconverts packed 32-bit integer elements stored in v to a smaller type depending on conv and stores them in memory location mt. hint indicates to the processor whether the data is non-temporal.

Operation

addr := MEM[mt] FOR j := 0 to 15 i := j*32 CASE conv OF _MM_DOWNCONV_EPI32_NONE: addr[i+31:i] := v[i+31:i] _MM_DOWNCONV_EPI32_UINT8: n := j*8 addr[n+7:n] := Int32ToUInt8(v[i+31:i]) _MM_DOWNCONV_EPI32_SINT8: n := j*8 addr[n+7:n] := Int32ToSInt8(v[i+31:i]) _MM_DOWNCONV_EPI32_UINT16: n := j*16 addr[n+15:n] := Int32ToUInt16(v[i+31:i]) _MM_DOWNCONV_EPI32_SINT16: n := j*16 addr[n+15:n] := Int32ToSInt16(v[i+31:i]) ESAC ENDFOR
vmovdqa32
void _mm512_mask_extstore_epi32 (void * mt, __mmask16 k, __m512i v, _MM_DOWNCONV_EPI32_ENUM conv, int hint)

Synopsis

void _mm512_mask_extstore_epi32 (void * mt, __mmask16 k, __m512i v, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovdqa32 m512 {k}, zmm
CPUID Flags: KNCNI

Description

Downconverts packed 32-bit integer elements stored in v to a smaller type depending on conv and stores them in memory location mt (elements in mt are unaltered when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.

Operation

addr := MEM[mt] FOR j := 0 to 15 i := j*32 IF k[j] CASE conv OF _MM_DOWNCONV_EPI32_NONE: addr[i+31:i] := v[i+31:i] _MM_DOWNCONV_EPI32_UINT8: n := j*8 addr[n+7:n] := Int32ToUInt8(v[i+31:i]) _MM_DOWNCONV_EPI32_SINT8: n := j*8 addr[n+7:n] := Int32ToSInt8(v[i+31:i]) _MM_DOWNCONV_EPI32_UINT16: n := j*16 addr[n+15:n] := Int32ToUInt16(v[i+31:i]) _MM_DOWNCONV_EPI32_SINT16: n := j*16 addr[n+15:n] := Int32ToSInt16(v[i+31:i]) ESAC FI ENDFOR
vmovdqa64
void _mm512_extstore_epi64 (void * mt, __m512i v, _MM_DOWNCONV_EPI64_ENUM conv, int hint)

Synopsis

void _mm512_extstore_epi64 (void * mt, __m512i v, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovdqa64 m512 {k}, zmm
CPUID Flags: KNCNI

Description

Downconverts packed 64-bit integer elements stored in v to a smaller type depending on conv and stores them in memory location mt. hint indicates to the processor whether the data is non-temporal.

Operation

addr := MEM[mt] FOR j := 0 to 7 i := j*64 CASE conv OF _MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v[i+63:i] ESAC ENDFOR
vmovdqa64
void _mm512_mask_extstore_epi64 (void * mt, __mmask8 k, __m512i v, _MM_DOWNCONV_EPI64_ENUM conv, int hint)

Synopsis

void _mm512_mask_extstore_epi64 (void * mt, __mmask8 k, __m512i v, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovdqa64 m512 {k}, zmm
CPUID Flags: KNCNI

Description

Downconverts packed 64-bit integer elements stored in v to a smaller type depending on conv and stores them in memory location mt (elements in mt are unaltered when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.

Operation

addr := MEM[mt] FOR j := 0 to 7 i := j*64 IF k[j] CASE conv OF _MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v[i+63:i] ESAC FI ENDFOR
vmovapd
void _mm512_extstore_pd (void * mt, __m512d v, _MM_DOWNCONV_PD_ENUM conv, int hint)

Synopsis

void _mm512_extstore_pd (void * mt, __m512d v, _MM_DOWNCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovapd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Downconverts packed double-precision (64-bit) floating-point elements stored in v to a smaller type depending on conv and stores them in memory location mt. hint indicates to the processor whether the data is non-temporal.

Operation

addr := MEM[mt] FOR j := 0 to 7 i := j*64 CASE conv OF _MM_DOWNCONV_PD_NONE: addr[i+63:i] := v[i+63:i] ESAC ENDFOR
vmovapd
void _mm512_mask_extstore_pd (void * mt, __mmask8 k, __m512d v, _MM_DOWNCONV_PD_ENUM conv, int hint)

Synopsis

void _mm512_mask_extstore_pd (void * mt, __mmask8 k, __m512d v, _MM_DOWNCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovapd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Downconverts packed double-precision (64-bit) floating-point elements stored in v to a smaller type depending on conv and stores them in memory location mt (elements in mt are unaltered when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.

Operation

addr := MEM[mt] FOR j := 0 to 7 i := j*64 IF k[j] CASE conv OF _MM_DOWNCONV_PD_NONE: addr[i+63:i] := v[i+63:i] ESAC FI ENDFOR
vmovaps
void _mm512_extstore_ps (void * mt, __m512 v, _MM_DOWNCONV_PS_ENUM conv, int hint)

Synopsis

void _mm512_extstore_ps (void * mt, __m512 v, _MM_DOWNCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovaps m512 {k}, zmm
CPUID Flags: KNCNI

Description

Downconverts packed single-precision (32-bit) floating-point elements stored in v to a smaller type depending on conv and stores them in memory location mt. hint indicates to the processor whether the data is non-temporal.

Operation

addr := MEM[mt] FOR j := 0 to 15 i := j*32 CASE conv OF _MM_DOWNCONV_PS_NONE: addr[i+31:i] := v[i+31:i] _MM_DOWNCONV_PS_FLOAT16: n := j*16 addr[n+15:n] := Float32ToFloat16(v[i+31:i]) _MM_DOWNCONV_PS_UINT8: n := j*8 addr[n+7:n] := Float32ToUInt8(v[i+31:i]) _MM_DOWNCONV_PS_SINT8: n := j*8 addr[n+7:n] := Float32ToSInt8(v[i+31:i]) _MM_DOWNCONV_PS_UINT16: n := j*16 addr[n+15:n] := Float32ToUInt16(v[i+31:i]) _MM_DOWNCONV_PS_SINT16: n := j*16 addr[n+15:n] := Float32ToSInt16(v[i+31:i]) ESAC ENDFOR
vmovaps
void _mm512_mask_extstore_ps (void * mt, __mmask16 k, __m512 v, _MM_DOWNCONV_PS_ENUM conv, int hint)

Synopsis

void _mm512_mask_extstore_ps (void * mt, __mmask16 k, __m512 v, _MM_DOWNCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovaps m512 {k}, zmm
CPUID Flags: KNCNI

Description

Downconverts packed single-precision (32-bit) floating-point elements stored in v to a smaller type depending on conv and stores them in memory location mt using writemask k (elements are not written to memory when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.

Operation

addr := MEM[mt] FOR j := 0 to 15 i := j*32 IF k[j] CASE conv OF _MM_DOWNCONV_PS_NONE: addr[i+31:i] := v[i+31:i] _MM_DOWNCONV_PS_FLOAT16: n := j*16 addr[n+15:n] := Float32ToFloat16(v[i+31:i]) _MM_DOWNCONV_PS_UINT8: n := j*8 addr[n+7:n] := Float32ToUInt8(v[i+31:i]) _MM_DOWNCONV_PS_SINT8: n := j*8 addr[n+7:n] := Float32ToSInt8(v[i+31:i]) _MM_DOWNCONV_PS_UINT16: n := j*16 addr[n+15:n] := Float32ToUInt16(v[i+31:i]) _MM_DOWNCONV_PS_SINT16: n := j*16 addr[n+15:n] := Float32ToSInt16(v[i+31:i]) ESAC FI ENDFOR
vfixupimmpd
__m128d _mm_fixupimm_pd (__m128d a, __m128d b, __m128i c, int imm8)

Synopsis

__m128d _mm_fixupimm_pd (__m128d a, __m128d b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd
CPUID Flags: AVX512VL + AVX512F

Description

Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? –INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1⁄2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } FOR j := 0 to 1 i := j*64 dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0
vfixupimmpd
__m128d _mm_mask_fixupimm_pd (__m128d a, __mmask8 k, __m128d b, __m128i c, int imm8)

Synopsis

__m128d _mm_mask_fixupimm_pd (__m128d a, __mmask8 k, __m128d b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd
CPUID Flags: AVX512VL + AVX512F

Description

Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? –INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1⁄2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
vfixupimmpd
__m128d _mm_maskz_fixupimm_pd (__mmask8 k, __m128d a, __m128d b, __m128i c, int imm8)

Synopsis

__m128d _mm_maskz_fixupimm_pd (__mmask8 k, __m128d a, __m128d b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd
CPUID Flags: AVX512VL + AVX512F

Description

Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? –INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1⁄2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vfixupimmpd
__m256d _mm256_fixupimm_pd (__m256d a, __m256d b, __m256i c, int imm8)

Synopsis

__m256d _mm256_fixupimm_pd (__m256d a, __m256d b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd
CPUID Flags: AVX512VL + AVX512F

Description

Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? –INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1⁄2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } FOR j := 0 to 3 i := j*64 dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0
vfixupimmpd
__m256d _mm256_mask_fixupimm_pd (__m256d a, __mmask8 k, __m256d b, __m256i c, int imm8)

Synopsis

__m256d _mm256_mask_fixupimm_pd (__m256d a, __mmask8 k, __m256d b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd
CPUID Flags: AVX512VL + AVX512F

Description

Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? –INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1⁄2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
vfixupimmpd
__m256d _mm256_maskz_fixupimm_pd (__mmask8 k, __m256d a, __m256d b, __m256i c, int imm8)

Synopsis

__m256d _mm256_maskz_fixupimm_pd (__mmask8 k, __m256d a, __m256d b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd
CPUID Flags: AVX512VL + AVX512F

Description

Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? –INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1⁄2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vfixupimmpd
__m512d _mm512_fixupimm_pd (__m512d a, __m512d b, __m512i c, int imm8)

Synopsis

__m512d _mm512_fixupimm_pd (__m512d a, __m512d b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? –INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1⁄2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } FOR j := 0 to 7 i := j*64 dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vfixupimmpd
__m512d _mm512_mask_fixupimm_pd (__m512d a, __mmask8 k, __m512d b, __m512i c, int imm8)

Synopsis

__m512d _mm512_mask_fixupimm_pd (__m512d a, __mmask8 k, __m512d b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? –INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1⁄2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfixupimmpd
__m512d _mm512_maskz_fixupimm_pd (__mmask8 k, __m512d a, __m512d b, __m512i c, int imm8)

Synopsis

__m512d _mm512_maskz_fixupimm_pd (__mmask8 k, __m512d a, __m512d b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? -INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1/2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[63:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfixupimmps
__m128 _mm_fixupimm_ps (__m128 a, __m128 b, __m128i c, int imm8)

Synopsis

__m128 _mm_fixupimm_ps (__m128 a, __m128 b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps
CPUID Flags: AVX512VL + AVX512F

Description

Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } FOR j := 0 to 3 i := j*32 dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0
vfixupimmps
__m128 _mm_mask_fixupimm_ps (__m128 a, __mmask8 k, __m128 b, __m128i c, int imm8)

Synopsis

__m128 _mm_mask_fixupimm_ps (__m128 a, __mmask8 k, __m128 b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps
CPUID Flags: AVX512VL + AVX512F

Description

Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
vfixupimmps
__m128 _mm_maskz_fixupimm_ps (__mmask8 k, __m128 a, __m128 b, __m128i c, int imm8)

Synopsis

__m128 _mm_maskz_fixupimm_ps (__mmask8 k, __m128 a, __m128 b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps
CPUID Flags: AVX512VL + AVX512F

Description

Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vfixupimmps
__m256 _mm256_fixupimm_ps (__m256 a, __m256 b, __m256i c, int imm8)

Synopsis

__m256 _mm256_fixupimm_ps (__m256 a, __m256 b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps
CPUID Flags: AVX512VL + AVX512F

Description

Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } FOR j := 0 to 7 i := j*32 dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0
vfixupimmps
__m256 _mm256_mask_fixupimm_ps (__m256 a, __mmask8 k, __m256 b, __m256i c, int imm8)

Synopsis

__m256 _mm256_mask_fixupimm_ps (__m256 a, __mmask8 k, __m256 b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps
CPUID Flags: AVX512VL + AVX512F

Description

Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
vfixupimmps
__m256 _mm256_maskz_fixupimm_ps (__mmask8 k, __m256 a, __m256 b, __m256i c, int imm8)

Synopsis

__m256 _mm256_maskz_fixupimm_ps (__mmask8 k, __m256 a, __m256 b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps
CPUID Flags: AVX512VL + AVX512F

Description

Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vfixupimmps
__m512 _mm512_fixupimm_ps (__m512 a, __m512 b, __m512i c, int imm8)

Synopsis

__m512 _mm512_fixupimm_ps (__m512 a, __m512 b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } FOR j := 0 to 15 i := j*32 dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vfixupimmps
__m512 _mm512_mask_fixupimm_ps (__m512 a, __mmask16 k, __m512 b, __m512i c, int imm8)

Synopsis

__m512 _mm512_mask_fixupimm_ps (__m512 a, __mmask16 k, __m512 b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfixupimmps
__m512 _mm512_maskz_fixupimm_ps (__mmask16 k, __m512 a, __m512 b, __m512i c, int imm8)

Synopsis

__m512 _mm512_maskz_fixupimm_ps (__mmask16 k, __m512 a, __m512 b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfixupimmpd
__m512d _mm512_fixupimm_round_pd (__m512d a, __m512d b, __m512i c, int imm8, int rounding)

Synopsis

__m512d _mm512_fixupimm_round_pd (__m512d a, __m512d b, __m512i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmpd zmm {k}, zmm, zmm, imm {er}
CPUID Flags: AVX512F

Description

Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? -INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1/2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[63:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } FOR j := 0 to 7 i := j*64 dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vfixupimmpd
__m512d _mm512_mask_fixupimm_round_pd (__m512d a, __mmask8 k, __m512d b, __m512i c, int imm8, int rounding)

Synopsis

__m512d _mm512_mask_fixupimm_round_pd (__m512d a, __mmask8 k, __m512d b, __m512i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmpd zmm {k}, zmm, zmm, imm {er}
CPUID Flags: AVX512F

Description

Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? -INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1/2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[63:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfixupimmpd
__m512d _mm512_maskz_fixupimm_round_pd (__mmask8 k, __m512d a, __m512d b, __m512i c, int imm8, int rounding)

Synopsis

__m512d _mm512_maskz_fixupimm_round_pd (__mmask8 k, __m512d a, __m512d b, __m512i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmpd zmm {k}, zmm, zmm, imm {er}
CPUID Flags: AVX512F

Description

Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? -INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1/2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[63:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfixupimmps
__m512 _mm512_fixupimm_round_ps (__m512 a, __m512 b, __m512i c, int imm8, int rounding)

Synopsis

__m512 _mm512_fixupimm_round_ps (__m512 a, __m512 b, __m512i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmps zmm {k}, zmm, zmm, imm {er}
CPUID Flags: AVX512F

Description

Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } FOR j := 0 to 15 i := j*32 dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vfixupimmps
__m512 _mm512_mask_fixupimm_round_ps (__m512 a, __mmask16 k, __m512 b, __m512i c, int imm8, int rounding)

Synopsis

__m512 _mm512_mask_fixupimm_round_ps (__m512 a, __mmask16 k, __m512 b, __m512i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmps zmm {k}, zmm, zmm, imm {er}
CPUID Flags: AVX512F

Description

Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfixupimmps
__m512 _mm512_maskz_fixupimm_round_ps (__mmask16 k, __m512 a, __m512 b, __m512i c, int imm8, int rounding)

Synopsis

__m512 _mm512_maskz_fixupimm_round_ps (__mmask16 k, __m512 a, __m512 b, __m512i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmps zmm {k}, zmm, zmm, imm {er}
CPUID Flags: AVX512F

Description

Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfixupimmsd
__m128d _mm_fixupimm_round_sd (__m128d a, __m128d b, __m128i c, int imm8, int rounding)

Synopsis

__m128d _mm_fixupimm_round_sd (__m128d a, __m128d b, __m128i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmsd xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F

Description

Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? –INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1⁄2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0
vfixupimmsd
__m128d _mm_mask_fixupimm_round_sd (__m128d a, __mmask8 k, __m128d b, __m128i c, int imm8, int rounding)

Synopsis

__m128d _mm_mask_fixupimm_round_sd (__m128d a, __mmask8 k, __m128d b, __m128i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmsd xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F

Description

Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? –INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1⁄2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } IF k[0] dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfixupimmsd
__m128d _mm_maskz_fixupimm_round_sd (__mmask8 k, __m128d a, __m128d b, __m128i c, int imm8, int rounding)

Synopsis

__m128d _mm_maskz_fixupimm_round_sd (__mmask8 k, __m128d a, __m128d b, __m128i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmsd xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F

Description

Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? –INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1⁄2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } IF k[0] dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfixupimmss
__m128 _mm_fixupimm_round_ss (__m128 a, __m128 b, __m128i c, int imm8, int rounding)

Synopsis

__m128 _mm_fixupimm_round_ss (__m128 a, __m128 b, __m128i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmss xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F

Description

Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
vfixupimmss
__m128 _mm_mask_fixupimm_round_ss (__m128 a, __mmask8 k, __m128 b, __m128i c, int imm8, int rounding)

Synopsis

__m128 _mm_mask_fixupimm_round_ss (__m128 a, __mmask8 k, __m128 b, __m128i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmss xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F

Description

Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } IF k[0] dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfixupimmss
__m128 _mm_maskz_fixupimm_round_ss (__mmask8 k, __m128 a, __m128 b, __m128i c, int imm8, int rounding)

Synopsis

__m128 _mm_maskz_fixupimm_round_ss (__mmask8 k, __m128 a, __m128 b, __m128i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmss xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F

Description

Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } IF k[0] dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfixupimmsd
__m128d _mm_fixupimm_sd (__m128d a, __m128d b, __m128i c, int imm8)

Synopsis

__m128d _mm_fixupimm_sd (__m128d a, __m128d b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmsd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? –INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1⁄2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0
vfixupimmsd
__m128d _mm_mask_fixupimm_sd (__m128d a, __mmask8 k, __m128d b, __m128i c, int imm8)

Synopsis

__m128d _mm_mask_fixupimm_sd (__m128d a, __mmask8 k, __m128d b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmsd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? –INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1⁄2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } IF k[0] dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfixupimmsd
__m128d _mm_maskz_fixupimm_sd (__mmask8 k, __m128d a, __m128d b, __m128i c, int imm8)

Synopsis

__m128d _mm_maskz_fixupimm_sd (__mmask8 k, __m128d a, __m128d b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmsd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? –INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1⁄2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } IF k[0] dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfixupimmss
__m128 _mm_fixupimm_ss (__m128 a, __m128 b, __m128i c, int imm8)

Synopsis

__m128 _mm_fixupimm_ss (__m128 a, __m128 b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmss xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
vfixupimmss
__m128 _mm_mask_fixupimm_ss (__m128 a, __mmask8 k, __m128 b, __m128i c, int imm8)

Synopsis

__m128 _mm_mask_fixupimm_ss (__m128 a, __mmask8 k, __m128 b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmss xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } IF k[0] dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfixupimmss
__m128 _mm_maskz_fixupimm_ss (__mmask8 k, __m128 a, __m128 b, __m128i c, int imm8)

Synopsis

__m128 _mm_maskz_fixupimm_ss (__mmask8 k, __m128 a, __m128 b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmss xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.

Operation

enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } IF k[0] dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfixupnanpd
__m512d _mm512_fixupnan_pd (__m512d v1, __m512d v2, __m512i v3)

Synopsis

__m512d _mm512_fixupnan_pd (__m512d v1, __m512d v2, __m512i v3)
#include "immintrin.h"
Instruction: vfixupnanpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Fixes up NaN's from packed double-precision (64-bit) floating-point elements in v1 and v2, storing the results in dst and storing the quietized NaN's from v1 in v3.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := FixupNaNs(v1[i+63:i], v2[i+63:i]) v3[i+63:i] := QuietizeNaNs(v1[i+63:i]) ENDFOR dst[MAX:512] := 0
vfixupnanpd
__m512d _mm512_mask_fixupnan_pd (__m512d v1, __mmask8 k, __m512d v2, __m512i v3)

Synopsis

__m512d _mm512_mask_fixupnan_pd (__m512d v1, __mmask8 k, __m512d v2, __m512i v3)
#include "immintrin.h"
Instruction: vfixupnanpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Fixes up NaN's from packed double-precision (64-bit) floating-point elements in v1 and v2, storing the results in dst using writemask k (only elements whose corresponding mask bit is set are used in the computation). Quietized NaN's from v1 are stored in v3.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := FixupNaNs(v1[i+63:i], v2[i+63:i]) v3[i+63:i] := QuietizeNaNs(v1[i+63:i]) FI ENDFOR dst[MAX:512] := 0
vfixupnanps
__m512 _mm512_fixupnan_ps (__m512 v1, __m512 v2, __m512i v3)

Synopsis

__m512 _mm512_fixupnan_ps (__m512 v1, __m512 v2, __m512i v3)
#include "immintrin.h"
Instruction: vfixupnanps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Fixes up NaN's from packed single-precision (32-bit) floating-point elements in v1 and v2, storing the results in dst and storing the quietized NaN's from v1 in v3.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := FixupNaNs(v1[i+31:i], v2[i+31:i]) v3[i+31:i] := QuietizeNaNs(v1[i+31:i]) ENDFOR dst[MAX:512] := 0
vfixupnanps
__m512 _mm512_mask_fixupnan_ps (__m512 v1, __mmask16 k, __m512 v2, __m512i v3)

Synopsis

__m512 _mm512_mask_fixupnan_ps (__m512 v1, __mmask16 k, __m512 v2, __m512i v3)
#include "immintrin.h"
Instruction: vfixupnanps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Fixes up NaN's from packed single-precision (32-bit) floating-point elements in v1 and v2, storing the results in dst using writemask k (only elements whose corresponding mask bit is set are used in the computation). Quietized NaN's from v1 are stored in v3.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := FixupNaNs(v1[i+31:i], v2[i+31:i]) v3[i+31:i] := QuietizeNaNs(v1[i+31:i]) FI ENDFOR dst[MAX:512] := 0
roundpd
__m128d _mm_floor_pd (__m128d a)

Synopsis

__m128d _mm_floor_pd (__m128d a)
#include "smmintrin.h"
Instruction: roundpd xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := FLOOR(a[i+63:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell62
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vroundpd
__m256d _mm256_floor_pd (__m256d a)

Synopsis

__m256d _mm256_floor_pd (__m256d a)
#include "immintrin.h"
Instruction: vroundpd ymm, ymm, imm
CPUID Flags: AVX

Description

Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := FLOOR(a[i+63:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell61
Ivy Bridge31
Sandy Bridge31
...
__m512d _mm512_floor_pd (__m512d a)

Synopsis

__m512d _mm512_floor_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := FLOOR(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_floor_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_floor_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := FLOOR(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
roundps
__m128 _mm_floor_ps (__m128 a)

Synopsis

__m128 _mm_floor_ps (__m128 a)
#include "smmintrin.h"
Instruction: roundps xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := FLOOR(a[i+31:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell62
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vroundps
__m256 _mm256_floor_ps (__m256 a)

Synopsis

__m256 _mm256_floor_ps (__m256 a)
#include "immintrin.h"
Instruction: vroundps ymm, ymm, imm
CPUID Flags: AVX

Description

Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := FLOOR(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell61
Ivy Bridge31
Sandy Bridge31
...
__m512 _mm512_floor_ps (__m512 a)

Synopsis

__m512 _mm512_floor_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := FLOOR(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_floor_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_floor_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := FLOOR(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
roundsd
__m128d _mm_floor_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_floor_sd (__m128d a, __m128d b)
#include "smmintrin.h"
Instruction: roundsd xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Round the lower double-precision (64-bit) floating-point element in b down to an integer value, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := FLOOR(b[63:0]) dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell62
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
roundss
__m128 _mm_floor_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_floor_ss (__m128 a, __m128 b)
#include "smmintrin.h"
Instruction: roundss xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Round the lower single-precision (32-bit) floating-point element in b down to an integer value, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := FLOOR(b[31:0]) dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell62
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vpmadd231d
__m512i _mm512_fmadd_epi32 (__m512i a, __m512i b, __m512i c)

Synopsis

__m512i _mm512_fmadd_epi32 (__m512i a, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd231d zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Multiply packed 32-bit integer elements in a and b, add the intermediate result to packed elements in c and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ENDFOR dst[MAX:512] := 0
vpmadd231d
__m512i _mm512_mask_fmadd_epi32 (__m512i a, __mmask16 k, __m512i b, __m512i c)

Synopsis

__m512i _mm512_mask_fmadd_epi32 (__m512i a, __mmask16 k, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd231d zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Multiply packed 32-bit integer elements in a and b, add the intermediate result to packed elements in c and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpmadd231d
__m512i _mm512_mask3_fmadd_epi32 (__m512i a, __m512i b, __m512i c, __mmask16 k)

Synopsis

__m512i _mm512_mask3_fmadd_epi32 (__m512i a, __m512i b, __m512i c, __mmask16 k)
#include "immintrin.h"
Instruction: vpmadd231d zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Multiply packed 32-bit integer elements in a and b, add the intermediate result to packed elements in c and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m128d _mm_fmadd_pd (__m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_fmadd_pd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmadd132pd xmm, xmm, xmm
             vfmadd213pd xmm, xmm, xmm
             vfmadd231pd xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m128d _mm_mask_fmadd_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)

Synopsis

__m128d _mm_mask_fmadd_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmadd132pd
             vfmadd213pd
             vfmadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m128d _mm_mask3_fmadd_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)

Synopsis

__m128d _mm_mask3_fmadd_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmadd132pd
             vfmadd213pd
             vfmadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:128] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m128d _mm_maskz_fmadd_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_maskz_fmadd_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmadd132pd
             vfmadd213pd
             vfmadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c)

Synopsis

__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmadd132pd ymm, ymm, ymm
             vfmadd213pd ymm, ymm, ymm
             vfmadd231pd ymm, ymm, ymm
CPUID Flags: FMA

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m256d _mm256_mask_fmadd_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)

Synopsis

__m256d _mm256_mask_fmadd_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmadd132pd
             vfmadd213pd
             vfmadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m256d _mm256_mask3_fmadd_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)

Synopsis

__m256d _mm256_mask3_fmadd_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmadd132pd
             vfmadd213pd
             vfmadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:256] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m256d _mm256_maskz_fmadd_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)

Synopsis

__m256d _mm256_maskz_fmadd_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmadd132pd
             vfmadd213pd
             vfmadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_fmadd_pd (__m512d a, __m512d b, __m512d c)

Synopsis

__m512d _mm512_fmadd_pd (__m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm
             vfmadd213pd zmm {k}, zmm, zmm
             vfmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ENDFOR dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_mask_fmadd_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)

Synopsis

__m512d _mm512_mask_fmadd_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm
             vfmadd213pd zmm {k}, zmm, zmm
             vfmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_mask3_fmadd_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)

Synopsis

__m512d _mm512_mask3_fmadd_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm
             vfmadd213pd zmm {k}, zmm, zmm
             vfmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_maskz_fmadd_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)

Synopsis

__m512d _mm512_maskz_fmadd_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm
             vfmadd213pd zmm {k}, zmm, zmm
             vfmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m128 _mm_fmadd_ps (__m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_fmadd_ps (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmadd132ps xmm, xmm, xmm
             vfmadd213ps xmm, xmm, xmm
             vfmadd231ps xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m128 _mm_mask_fmadd_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)

Synopsis

__m128 _mm_mask_fmadd_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmadd132ps
             vfmadd213ps
             vfmadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m128 _mm_mask3_fmadd_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)

Synopsis

__m128 _mm_mask3_fmadd_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmadd132ps
             vfmadd213ps
             vfmadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:128] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m128 _mm_maskz_fmadd_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_maskz_fmadd_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmadd132ps
             vfmadd213ps
             vfmadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m256 _mm256_fmadd_ps (__m256 a, __m256 b, __m256 c)

Synopsis

__m256 _mm256_fmadd_ps (__m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmadd132ps ymm, ymm, ymm
             vfmadd213ps ymm, ymm, ymm
             vfmadd231ps ymm, ymm, ymm
CPUID Flags: FMA

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m256 _mm256_mask_fmadd_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)

Synopsis

__m256 _mm256_mask_fmadd_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmadd132ps
             vfmadd213ps
             vfmadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m256 _mm256_mask3_fmadd_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)

Synopsis

__m256 _mm256_mask3_fmadd_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmadd132ps
             vfmadd213ps
             vfmadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:256] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m256 _mm256_maskz_fmadd_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)

Synopsis

__m256 _mm256_maskz_fmadd_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmadd132ps
             vfmadd213ps
             vfmadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_fmadd_ps (__m512 a, __m512 b, __m512 c)

Synopsis

__m512 _mm512_fmadd_ps (__m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm
             vfmadd213ps zmm {k}, zmm, zmm
             vfmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ENDFOR dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_mask_fmadd_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)

Synopsis

__m512 _mm512_mask_fmadd_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm
             vfmadd213ps zmm {k}, zmm, zmm
             vfmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_mask3_fmadd_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)

Synopsis

__m512 _mm512_mask3_fmadd_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm
             vfmadd213ps zmm {k}, zmm, zmm
             vfmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_maskz_fmadd_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)

Synopsis

__m512 _mm512_maskz_fmadd_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm
             vfmadd213ps zmm {k}, zmm, zmm
             vfmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_fmadd_round_pd (__m512d a, __m512d b, __m512d c, int rounding)

Synopsis

__m512d _mm512_fmadd_round_pd (__m512d a, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm {er}
             vfmadd213pd zmm {k}, zmm, zmm {er}
             vfmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ENDFOR dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_mask_fmadd_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)

Synopsis

__m512d _mm512_mask_fmadd_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm {er}
             vfmadd213pd zmm {k}, zmm, zmm {er}
             vfmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_mask3_fmadd_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)

Synopsis

__m512d _mm512_mask3_fmadd_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm {er}
             vfmadd213pd zmm {k}, zmm, zmm {er}
             vfmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_maskz_fmadd_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)

Synopsis

__m512d _mm512_maskz_fmadd_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm {er}
             vfmadd213pd zmm {k}, zmm, zmm {er}
             vfmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_fmadd_round_ps (__m512 a, __m512 b, __m512 c, int rounding)

Synopsis

__m512 _mm512_fmadd_round_ps (__m512 a, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm {er}
             vfmadd213ps zmm {k}, zmm, zmm {er}
             vfmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ENDFOR dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_mask_fmadd_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)

Synopsis

__m512 _mm512_mask_fmadd_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm {er}
             vfmadd213ps zmm {k}, zmm, zmm {er}
             vfmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_mask3_fmadd_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)

Synopsis

__m512 _mm512_mask3_fmadd_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm {er}
             vfmadd213ps zmm {k}, zmm, zmm {er}
             vfmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_maskz_fmadd_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)

Synopsis

__m512 _mm512_maskz_fmadd_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm {er}
             vfmadd213ps zmm {k}, zmm, zmm {er}
             vfmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfmadd132sd, vfmadd213sd, vfmadd231sd
__m128d _mm_mask_fmadd_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)

Synopsis

__m128d _mm_mask_fmadd_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132sd xmm {k}, xmm, xmm {er}
             vfmadd213sd xmm {k}, xmm, xmm {er}
             vfmadd231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfmadd132sd, vfmadd213sd, vfmadd231sd
__m128d _mm_mask3_fmadd_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)

Synopsis

__m128d _mm_mask3_fmadd_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfmadd132sd xmm {k}, xmm, xmm {er}
             vfmadd213sd xmm {k}, xmm, xmm {er}
             vfmadd231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := c[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfmadd132sd, vfmadd213sd, vfmadd231sd
__m128d _mm_maskz_fmadd_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)

Synopsis

__m128d _mm_maskz_fmadd_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132sd xmm {k}, xmm, xmm {er}
             vfmadd213sd xmm {k}, xmm, xmm {er}
             vfmadd231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfmadd132ss, vfmadd213ss, vfmadd231ss
__m128 _mm_mask_fmadd_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)

Synopsis

__m128 _mm_mask_fmadd_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132ss xmm {k}, xmm, xmm {er}
             vfmadd213ss xmm {k}, xmm, xmm {er}
             vfmadd231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfmadd132ss, vfmadd213ss, vfmadd231ss
__m128 _mm_mask3_fmadd_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)

Synopsis

__m128 _mm_mask3_fmadd_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfmadd132ss xmm {k}, xmm, xmm {er}
             vfmadd213ss xmm {k}, xmm, xmm {er}
             vfmadd231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := c[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfmadd132ss, vfmadd213ss, vfmadd231ss
__m128 _mm_maskz_fmadd_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)

Synopsis

__m128 _mm_maskz_fmadd_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132ss xmm {k}, xmm, xmm {er}
             vfmadd213ss xmm {k}, xmm, xmm {er}
             vfmadd231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfmadd132sd, vfmadd213sd, vfmadd231sd
__m128d _mm_fmadd_sd (__m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_fmadd_sd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmadd132sd xmm, xmm, xmm
             vfmadd213sd xmm, xmm, xmm
             vfmadd231sd xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] dst[127:64] := a[127:64] dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfmadd132sd, vfmadd213sd, vfmadd231sd
__m128d _mm_mask_fmadd_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)

Synopsis

__m128d _mm_mask_fmadd_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmadd132sd xmm {k}, xmm, xmm
             vfmadd213sd xmm {k}, xmm, xmm
             vfmadd231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfmadd132sd, vfmadd213sd, vfmadd231sd
__m128d _mm_mask3_fmadd_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)

Synopsis

__m128d _mm_mask3_fmadd_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmadd132sd xmm {k}, xmm, xmm
             vfmadd213sd xmm {k}, xmm, xmm
             vfmadd231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := c[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfmadd132sd, vfmadd213sd, vfmadd231sd
__m128d _mm_maskz_fmadd_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_maskz_fmadd_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmadd132sd xmm {k}, xmm, xmm
             vfmadd213sd xmm {k}, xmm, xmm
             vfmadd231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfmadd132ss, vfmadd213ss, vfmadd231ss
__m128 _mm_fmadd_ss (__m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_fmadd_ss (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmadd132ss xmm, xmm, xmm
             vfmadd213ss xmm, xmm, xmm
             vfmadd231ss xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] dst[127:32] := a[127:32] dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfmadd132ss, vfmadd213ss, vfmadd231ss
__m128 _mm_mask_fmadd_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)

Synopsis

__m128 _mm_mask_fmadd_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmadd132ss xmm {k}, xmm, xmm
             vfmadd213ss xmm {k}, xmm, xmm
             vfmadd231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfmadd132ss, vfmadd213ss, vfmadd231ss
__m128 _mm_mask3_fmadd_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)

Synopsis

__m128 _mm_mask3_fmadd_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmadd132ss xmm {k}, xmm, xmm
             vfmadd213ss xmm {k}, xmm, xmm
             vfmadd231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := c[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfmadd132ss, vfmadd213ss, vfmadd231ss
__m128 _mm_maskz_fmadd_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_maskz_fmadd_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmadd132ss xmm {k}, xmm, xmm
             vfmadd213ss xmm {k}, xmm, xmm
             vfmadd231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vpmadd233d
__m512i _mm512_fmadd233_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_fmadd233_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmadd233d zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Multiply packed 32-bit integer elements in each 4-element set of a by element 1 of the corresponding 4-element set from b, add the intermediate result to element 0 of the corresponding 4-element set from b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 base := (j & ~0x3) * 32 scale[31:0] := b[base+63:base+32] bias[31:0] := b[base+31:base] dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] ENDFOR dst[MAX:512] := 0
vpmadd233d
__m512i _mm512_mask_fmadd233_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_fmadd233_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmadd233d zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Multiply packed 32-bit integer elements in each 4-element set of a by element 1 of the corresponding 4-element set from b, add the intermediate result to element 0 of the corresponding 4-element set from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] base := (j & ~0x3) * 32 scale[31:0] := b[base+63:base+32] bias[31:0] := b[base+31:base] dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmadd233ps
__m512 _mm512_fmadd233_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_fmadd233_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vfmadd233ps zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of a by element 1 of the corresponding 4-element set from b, add the intermediate result to element 0 of the corresponding 4-element set from b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 base := (j & ~0x3) * 32 scale[31:0] := b[base+63:base+32] bias[31:0] := b[base+31:base] dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] ENDFOR dst[MAX:512] := 0
vfmadd233ps
__m512 _mm512_mask_fmadd233_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_fmadd233_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vfmadd233ps zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of a by element 1 of the corresponding 4-element set from b, add the intermediate result to element 0 of the corresponding 4-element set from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] base := (j & ~0x3) * 32 scale[31:0] := b[base+63:base+32] bias[31:0] := b[base+31:base] dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmadd233ps
__m512 _mm512_fmadd233_round_ps (__m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_fmadd233_round_ps (__m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vfmadd233ps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of a by element 1 of the corresponding 4-element set from b, add the intermediate result to element 0 of the corresponding 4-element set from b, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 base := (j & ~0x3) * 32 scale[31:0] := b[base+63:base+32] bias[31:0] := b[base+31:base] dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] ENDFOR dst[MAX:512] := 0
vfmadd233ps
__m512 _mm512_mask_fmadd233_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_mask_fmadd233_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vfmadd233ps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of a by element 1 of the corresponding 4-element set from b, add the intermediate result to element 0 of the corresponding 4-element set from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] base := (j & ~0x3) * 32 scale[31:0] := b[base+63:base+32] bias[31:0] := b[base+31:base] dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m128d _mm_fmaddsub_pd (__m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_fmaddsub_pd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd xmm, xmm, xmm
             vfmaddsub213pd xmm, xmm, xmm
             vfmaddsub231pd xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m128d _mm_mask_fmaddsub_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)

Synopsis

__m128d _mm_mask_fmaddsub_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd
             vfmaddsub213pd
             vfmaddsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m128d _mm_mask3_fmaddsub_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)

Synopsis

__m128d _mm_mask3_fmaddsub_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmaddsub132pd
             vfmaddsub213pd
             vfmaddsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:128] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m128d _mm_maskz_fmaddsub_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_maskz_fmaddsub_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd
             vfmaddsub213pd
             vfmaddsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m256d _mm256_fmaddsub_pd (__m256d a, __m256d b, __m256d c)

Synopsis

__m256d _mm256_fmaddsub_pd (__m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd ymm, ymm, ymm
             vfmaddsub213pd ymm, ymm, ymm
             vfmaddsub231pd ymm, ymm, ymm
CPUID Flags: FMA

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m256d _mm256_mask_fmaddsub_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)

Synopsis

__m256d _mm256_mask_fmaddsub_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd
             vfmaddsub213pd
             vfmaddsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m256d _mm256_mask3_fmaddsub_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)

Synopsis

__m256d _mm256_mask3_fmaddsub_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmaddsub132pd
             vfmaddsub213pd
             vfmaddsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:256] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m256d _mm256_maskz_fmaddsub_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)

Synopsis

__m256d _mm256_maskz_fmaddsub_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd
             vfmaddsub213pd
             vfmaddsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_fmaddsub_pd (__m512d a, __m512d b, __m512d c)

Synopsis

__m512d _mm512_fmaddsub_pd (__m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm
             vfmaddsub213pd zmm {k}, zmm, zmm
             vfmaddsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_mask_fmaddsub_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)

Synopsis

__m512d _mm512_mask_fmaddsub_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm
             vfmaddsub213pd zmm {k}, zmm, zmm
             vfmaddsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_mask3_fmaddsub_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)

Synopsis

__m512d _mm512_mask3_fmaddsub_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm
             vfmaddsub213pd zmm {k}, zmm, zmm
             vfmaddsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_maskz_fmaddsub_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)

Synopsis

__m512d _mm512_maskz_fmaddsub_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm
             vfmaddsub213pd zmm {k}, zmm, zmm
             vfmaddsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m128 _mm_fmaddsub_ps (__m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_fmaddsub_ps (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps xmm, xmm, xmm
             vfmaddsub213ps xmm, xmm, xmm
             vfmaddsub231ps xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ENDFOR dst[MAX:128] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 5       | 0.5
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m128 _mm_mask_fmaddsub_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)

Synopsis

__m128 _mm_mask_fmaddsub_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps
             vfmaddsub213ps
             vfmaddsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m128 _mm_mask3_fmaddsub_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)

Synopsis

__m128 _mm_mask3_fmaddsub_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmaddsub132ps
             vfmaddsub213ps
             vfmaddsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:128] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m128 _mm_maskz_fmaddsub_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_maskz_fmaddsub_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps
             vfmaddsub213ps
             vfmaddsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m256 _mm256_fmaddsub_ps (__m256 a, __m256 b, __m256 c)

Synopsis

__m256 _mm256_fmaddsub_ps (__m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps ymm, ymm, ymm
             vfmaddsub213ps ymm, ymm, ymm
             vfmaddsub231ps ymm, ymm, ymm
CPUID Flags: FMA

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 5       | 0.5
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m256 _mm256_mask_fmaddsub_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)

Synopsis

__m256 _mm256_mask_fmaddsub_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps
             vfmaddsub213ps
             vfmaddsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m256 _mm256_mask3_fmaddsub_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)

Synopsis

__m256 _mm256_mask3_fmaddsub_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmaddsub132ps
             vfmaddsub213ps
             vfmaddsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:256] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m256 _mm256_maskz_fmaddsub_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)

Synopsis

__m256 _mm256_maskz_fmaddsub_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps
             vfmaddsub213ps
             vfmaddsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_fmaddsub_ps (__m512 a, __m512 b, __m512 c)

Synopsis

__m512 _mm512_fmaddsub_ps (__m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm
             vfmaddsub213ps zmm {k}, zmm, zmm
             vfmaddsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_mask_fmaddsub_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)

Synopsis

__m512 _mm512_mask_fmaddsub_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm
             vfmaddsub213ps zmm {k}, zmm, zmm
             vfmaddsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_mask3_fmaddsub_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)

Synopsis

__m512 _mm512_mask3_fmaddsub_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm
             vfmaddsub213ps zmm {k}, zmm, zmm
             vfmaddsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_maskz_fmaddsub_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)

Synopsis

__m512 _mm512_maskz_fmaddsub_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm
             vfmaddsub213ps zmm {k}, zmm, zmm
             vfmaddsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_fmaddsub_round_pd (__m512d a, __m512d b, __m512d c, const int rounding)

Synopsis

__m512d _mm512_fmaddsub_round_pd (__m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm {er}
             vfmaddsub213pd zmm {k}, zmm, zmm {er}
             vfmaddsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_mask_fmaddsub_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, const int rounding)

Synopsis

__m512d _mm512_mask_fmaddsub_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm {er}
             vfmaddsub213pd zmm {k}, zmm, zmm {er}
             vfmaddsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_mask3_fmaddsub_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, const int rounding)

Synopsis

__m512d _mm512_mask3_fmaddsub_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm {er}
             vfmaddsub213pd zmm {k}, zmm, zmm {er}
             vfmaddsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_maskz_fmaddsub_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)

Synopsis

__m512d _mm512_maskz_fmaddsub_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm {er}
             vfmaddsub213pd zmm {k}, zmm, zmm {er}
             vfmaddsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_fmaddsub_round_ps (__m512 a, __m512 b, __m512 c, const int rounding)

Synopsis

__m512 _mm512_fmaddsub_round_ps (__m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm {er}
             vfmaddsub213ps zmm {k}, zmm, zmm {er}
             vfmaddsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_mask_fmaddsub_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, const int rounding)

Synopsis

__m512 _mm512_mask_fmaddsub_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm {er}
             vfmaddsub213ps zmm {k}, zmm, zmm {er}
             vfmaddsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_mask3_fmaddsub_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, const int rounding)

Synopsis

__m512 _mm512_mask3_fmaddsub_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm {er}
             vfmaddsub213ps zmm {k}, zmm, zmm {er}
             vfmaddsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_maskz_fmaddsub_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)

Synopsis

__m512 _mm512_maskz_fmaddsub_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm {er}
             vfmaddsub213ps zmm {k}, zmm, zmm {er}
             vfmaddsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m128d _mm_fmsub_pd (__m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_fmsub_pd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsub132pd xmm, xmm, xmm
             vfmsub213pd xmm, xmm, xmm
             vfmsub231pd xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ENDFOR dst[MAX:128] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 5       | 0.5
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m128d _mm_mask_fmsub_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)

Synopsis

__m128d _mm_mask_fmsub_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsub132pd
             vfmsub213pd
             vfmsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m128d _mm_mask3_fmsub_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)

Synopsis

__m128d _mm_mask3_fmsub_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsub132pd
             vfmsub213pd
             vfmsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:128] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m128d _mm_maskz_fmsub_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_maskz_fmsub_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsub132pd
             vfmsub213pd
             vfmsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m256d _mm256_fmsub_pd (__m256d a, __m256d b, __m256d c)

Synopsis

__m256d _mm256_fmsub_pd (__m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmsub132pd ymm, ymm, ymm
             vfmsub213pd ymm, ymm, ymm
             vfmsub231pd ymm, ymm, ymm
CPUID Flags: FMA

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 5       | 0.5
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m256d _mm256_mask_fmsub_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)

Synopsis

__m256d _mm256_mask_fmsub_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmsub132pd
             vfmsub213pd
             vfmsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m256d _mm256_mask3_fmsub_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)

Synopsis

__m256d _mm256_mask3_fmsub_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsub132pd
             vfmsub213pd
             vfmsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:256] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m256d _mm256_maskz_fmsub_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)

Synopsis

__m256d _mm256_maskz_fmsub_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmsub132pd
             vfmsub213pd
             vfmsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_fmsub_pd (__m512d a, __m512d b, __m512d c)

Synopsis

__m512d _mm512_fmsub_pd (__m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm
             vfmsub213pd zmm {k}, zmm, zmm
             vfmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ENDFOR dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_mask_fmsub_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)

Synopsis

__m512d _mm512_mask_fmsub_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm
             vfmsub213pd zmm {k}, zmm, zmm
             vfmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_mask3_fmsub_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)

Synopsis

__m512d _mm512_mask3_fmsub_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm
             vfmsub213pd zmm {k}, zmm, zmm
             vfmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_maskz_fmsub_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)

Synopsis

__m512d _mm512_maskz_fmsub_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm
             vfmsub213pd zmm {k}, zmm, zmm
             vfmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsub132ps xmm, xmm, xmm
             vfmsub213ps xmm, xmm, xmm
             vfmsub231ps xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ENDFOR dst[MAX:128] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 5       | 0.5
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m128 _mm_mask_fmsub_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)

Synopsis

__m128 _mm_mask_fmsub_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsub132ps
             vfmsub213ps
             vfmsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m128 _mm_mask3_fmsub_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)

Synopsis

__m128 _mm_mask3_fmsub_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsub132ps
             vfmsub213ps
             vfmsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:128] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m128 _mm_maskz_fmsub_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_maskz_fmsub_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsub132ps
             vfmsub213ps
             vfmsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m256 _mm256_fmsub_ps (__m256 a, __m256 b, __m256 c)

Synopsis

__m256 _mm256_fmsub_ps (__m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmsub132ps ymm, ymm, ymm
             vfmsub213ps ymm, ymm, ymm
             vfmsub231ps ymm, ymm, ymm
CPUID Flags: FMA

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 5       | 0.5
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m256 _mm256_mask_fmsub_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)

Synopsis

__m256 _mm256_mask_fmsub_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmsub132ps
             vfmsub213ps
             vfmsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m256 _mm256_mask3_fmsub_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)

Synopsis

__m256 _mm256_mask3_fmsub_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsub132ps
             vfmsub213ps
             vfmsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:256] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m256 _mm256_maskz_fmsub_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)

Synopsis

__m256 _mm256_maskz_fmsub_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmsub132ps
             vfmsub213ps
             vfmsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_fmsub_ps (__m512 a, __m512 b, __m512 c)

Synopsis

__m512 _mm512_fmsub_ps (__m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm
             vfmsub213ps zmm {k}, zmm, zmm
             vfmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ENDFOR dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_mask_fmsub_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)

Synopsis

__m512 _mm512_mask_fmsub_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm
             vfmsub213ps zmm {k}, zmm, zmm
             vfmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_mask3_fmsub_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)

Synopsis

__m512 _mm512_mask3_fmsub_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm
             vfmsub213ps zmm {k}, zmm, zmm
             vfmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_maskz_fmsub_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)

Synopsis

__m512 _mm512_maskz_fmsub_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm
             vfmsub213ps zmm {k}, zmm, zmm
             vfmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_fmsub_round_pd (__m512d a, __m512d b, __m512d c, int rounding)

Synopsis

__m512d _mm512_fmsub_round_pd (__m512d a, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm {er}
             vfmsub213pd zmm {k}, zmm, zmm {er}
             vfmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ENDFOR dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_mask_fmsub_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)

Synopsis

__m512d _mm512_mask_fmsub_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm {er}
             vfmsub213pd zmm {k}, zmm, zmm {er}
             vfmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_mask3_fmsub_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)

Synopsis

__m512d _mm512_mask3_fmsub_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm {er}
             vfmsub213pd zmm {k}, zmm, zmm {er}
             vfmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_maskz_fmsub_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)

Synopsis

__m512d _mm512_maskz_fmsub_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm {er}
             vfmsub213pd zmm {k}, zmm, zmm {er}
             vfmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_fmsub_round_ps (__m512 a, __m512 b, __m512 c, int rounding)

Synopsis

__m512 _mm512_fmsub_round_ps (__m512 a, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm {er}
             vfmsub213ps zmm {k}, zmm, zmm {er}
             vfmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ENDFOR dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_mask_fmsub_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)

Synopsis

__m512 _mm512_mask_fmsub_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm {er}
             vfmsub213ps zmm {k}, zmm, zmm {er}
             vfmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_mask3_fmsub_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)

Synopsis

__m512 _mm512_mask3_fmsub_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm {er}
             vfmsub213ps zmm {k}, zmm, zmm {er}
             vfmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_maskz_fmsub_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)

Synopsis

__m512 _mm512_maskz_fmsub_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm {er}
             vfmsub213ps zmm {k}, zmm, zmm {er}
             vfmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfmsub132sd, vfmsub213sd, vfmsub231sd
__m128d _mm_mask_fmsub_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)

Synopsis

__m128d _mm_mask_fmsub_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132sd xmm {k}, xmm, xmm {er}
             vfmsub213sd xmm {k}, xmm, xmm {er}
             vfmsub231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfmsub132sd, vfmsub213sd, vfmsub231sd
__m128d _mm_mask3_fmsub_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)

Synopsis

__m128d _mm_mask3_fmsub_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfmsub132sd xmm {k}, xmm, xmm {er}
             vfmsub213sd xmm {k}, xmm, xmm {er}
             vfmsub231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := c[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfmsub132sd, vfmsub213sd, vfmsub231sd
__m128d _mm_maskz_fmsub_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)

Synopsis

__m128d _mm_maskz_fmsub_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132sd xmm {k}, xmm, xmm {er}
             vfmsub213sd xmm {k}, xmm, xmm {er}
             vfmsub231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfmsub132ss, vfmsub213ss, vfmsub231ss
__m128 _mm_mask_fmsub_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)

Synopsis

__m128 _mm_mask_fmsub_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132ss xmm {k}, xmm, xmm {er}
             vfmsub213ss xmm {k}, xmm, xmm {er}
             vfmsub231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfmsub132ss, vfmsub213ss, vfmsub231ss
__m128 _mm_mask3_fmsub_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)

Synopsis

__m128 _mm_mask3_fmsub_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfmsub132ss xmm {k}, xmm, xmm {er}
             vfmsub213ss xmm {k}, xmm, xmm {er}
             vfmsub231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := c[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfmsub132ss, vfmsub213ss, vfmsub231ss
__m128 _mm_maskz_fmsub_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)

Synopsis

__m128 _mm_maskz_fmsub_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132ss xmm {k}, xmm, xmm {er}
             vfmsub213ss xmm {k}, xmm, xmm {er}
             vfmsub231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfmsub132sd, vfmsub213sd, vfmsub231sd
__m128d _mm_fmsub_sd (__m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_fmsub_sd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsub132sd xmm, xmm, xmm
             vfmsub213sd xmm, xmm, xmm
             vfmsub231sd xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] dst[127:64] := a[127:64] dst[MAX:128] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 5       | 0.5
vfmsub132sd, vfmsub213sd, vfmsub231sd
__m128d _mm_mask_fmsub_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)

Synopsis

__m128d _mm_mask_fmsub_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsub132sd xmm {k}, xmm, xmm
             vfmsub213sd xmm {k}, xmm, xmm
             vfmsub231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfmsub132sd, vfmsub213sd, vfmsub231sd
__m128d _mm_mask3_fmsub_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)

Synopsis

__m128d _mm_mask3_fmsub_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsub132sd xmm {k}, xmm, xmm
             vfmsub213sd xmm {k}, xmm, xmm
             vfmsub231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := c[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfmsub132sd, vfmsub213sd, vfmsub231sd
__m128d _mm_maskz_fmsub_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_maskz_fmsub_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsub132sd xmm {k}, xmm, xmm
             vfmsub213sd xmm {k}, xmm, xmm
             vfmsub231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfmsub132ss, vfmsub213ss, vfmsub231ss
__m128 _mm_fmsub_ss (__m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_fmsub_ss (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsub132ss xmm, xmm, xmm
             vfmsub213ss xmm, xmm, xmm
             vfmsub231ss xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] dst[127:32] := a[127:32] dst[MAX:128] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 5       | 0.5
vfmsub132ss, vfmsub213ss, vfmsub231ss
__m128 _mm_mask_fmsub_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)

Synopsis

__m128 _mm_mask_fmsub_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsub132ss xmm {k}, xmm, xmm
             vfmsub213ss xmm {k}, xmm, xmm
             vfmsub231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfmsub132ss, vfmsub213ss, vfmsub231ss
__m128 _mm_mask3_fmsub_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)

Synopsis

__m128 _mm_mask3_fmsub_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsub132ss xmm {k}, xmm, xmm
             vfmsub213ss xmm {k}, xmm, xmm
             vfmsub231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := c[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfmsub132ss, vfmsub213ss, vfmsub231ss
__m128 _mm_maskz_fmsub_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_maskz_fmsub_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsub132ss xmm {k}, xmm, xmm
             vfmsub213ss xmm {k}, xmm, xmm
             vfmsub231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m128d _mm_fmsubadd_pd (__m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_fmsubadd_pd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd xmm, xmm, xmm
             vfmsubadd213pd xmm, xmm, xmm
             vfmsubadd231pd xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ENDFOR dst[MAX:128] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 5       | 0.5
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m128d _mm_mask_fmsubadd_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)

Synopsis

__m128d _mm_mask_fmsubadd_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd
             vfmsubadd213pd
             vfmsubadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m128d _mm_mask3_fmsubadd_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)

Synopsis

__m128d _mm_mask3_fmsubadd_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsubadd132pd
             vfmsubadd213pd
             vfmsubadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:128] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m128d _mm_maskz_fmsubadd_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_maskz_fmsubadd_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd
             vfmsubadd213pd
             vfmsubadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m256d _mm256_fmsubadd_pd (__m256d a, __m256d b, __m256d c)

Synopsis

__m256d _mm256_fmsubadd_pd (__m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd ymm, ymm, ymm
             vfmsubadd213pd ymm, ymm, ymm
             vfmsubadd231pd ymm, ymm, ymm
CPUID Flags: FMA

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 5       | 0.5
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m256d _mm256_mask_fmsubadd_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)

Synopsis

__m256d _mm256_mask_fmsubadd_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd
             vfmsubadd213pd
             vfmsubadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m256d _mm256_mask3_fmsubadd_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)

Synopsis

__m256d _mm256_mask3_fmsubadd_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsubadd132pd
             vfmsubadd213pd
             vfmsubadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:256] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m256d _mm256_maskz_fmsubadd_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)

Synopsis

__m256d _mm256_maskz_fmsubadd_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd
             vfmsubadd213pd
             vfmsubadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_fmsubadd_pd (__m512d a, __m512d b, __m512d c)

Synopsis

__m512d _mm512_fmsubadd_pd (__m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm
             vfmsubadd213pd zmm {k}, zmm, zmm
             vfmsubadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_mask_fmsubadd_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)

Synopsis

__m512d _mm512_mask_fmsubadd_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm
             vfmsubadd213pd zmm {k}, zmm, zmm
             vfmsubadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_mask3_fmsubadd_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)

Synopsis

__m512d _mm512_mask3_fmsubadd_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm
             vfmsubadd213pd zmm {k}, zmm, zmm
             vfmsubadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_maskz_fmsubadd_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)

Synopsis

__m512d _mm512_maskz_fmsubadd_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm
             vfmsubadd213pd zmm {k}, zmm, zmm
             vfmsubadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m128 _mm_fmsubadd_ps (__m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_fmsubadd_ps (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps xmm, xmm, xmm
             vfmsubadd213ps xmm, xmm, xmm
             vfmsubadd231ps xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ENDFOR dst[MAX:128] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 5       | 0.5
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m128 _mm_mask_fmsubadd_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)

Synopsis

__m128 _mm_mask_fmsubadd_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps
             vfmsubadd213ps
             vfmsubadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m128 _mm_mask3_fmsubadd_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)

Synopsis

__m128 _mm_mask3_fmsubadd_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsubadd132ps
             vfmsubadd213ps
             vfmsubadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:128] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m128 _mm_maskz_fmsubadd_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_maskz_fmsubadd_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps
             vfmsubadd213ps
             vfmsubadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m256 _mm256_fmsubadd_ps (__m256 a, __m256 b, __m256 c)

Synopsis

__m256 _mm256_fmsubadd_ps (__m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps ymm, ymm, ymm
             vfmsubadd213ps ymm, ymm, ymm
             vfmsubadd231ps ymm, ymm, ymm
CPUID Flags: FMA

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 5       | 0.5
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m256 _mm256_mask_fmsubadd_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)

Synopsis

__m256 _mm256_mask_fmsubadd_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps
             vfmsubadd213ps
             vfmsubadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m256 _mm256_mask3_fmsubadd_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)

Synopsis

__m256 _mm256_mask3_fmsubadd_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsubadd132ps
             vfmsubadd213ps
             vfmsubadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:256] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m256 _mm256_maskz_fmsubadd_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)

Synopsis

__m256 _mm256_maskz_fmsubadd_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps
             vfmsubadd213ps
             vfmsubadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_fmsubadd_ps (__m512 a, __m512 b, __m512 c)

Synopsis

__m512 _mm512_fmsubadd_ps (__m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm
             vfmsubadd213ps zmm {k}, zmm, zmm
             vfmsubadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_mask_fmsubadd_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)

Synopsis

__m512 _mm512_mask_fmsubadd_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm
             vfmsubadd213ps zmm {k}, zmm, zmm
             vfmsubadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_mask3_fmsubadd_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)

Synopsis

__m512 _mm512_mask3_fmsubadd_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm
             vfmsubadd213ps zmm {k}, zmm, zmm
             vfmsubadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_maskz_fmsubadd_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)

Synopsis

__m512 _mm512_maskz_fmsubadd_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm
             vfmsubadd213ps zmm {k}, zmm, zmm
             vfmsubadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_fmsubadd_round_pd (__m512d a, __m512d b, __m512d c, const int rounding)

Synopsis

__m512d _mm512_fmsubadd_round_pd (__m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm {er}
             vfmsubadd213pd zmm {k}, zmm, zmm {er}
             vfmsubadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_mask_fmsubadd_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, const int rounding)

Synopsis

__m512d _mm512_mask_fmsubadd_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm {er}
             vfmsubadd213pd zmm {k}, zmm, zmm {er}
             vfmsubadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_mask3_fmsubadd_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, const int rounding)

Synopsis

__m512d _mm512_mask3_fmsubadd_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm {er}
             vfmsubadd213pd zmm {k}, zmm, zmm {er}
             vfmsubadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_maskz_fmsubadd_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)

Synopsis

__m512d _mm512_maskz_fmsubadd_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm {er}
             vfmsubadd213pd zmm {k}, zmm, zmm {er}
             vfmsubadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_fmsubadd_round_ps (__m512 a, __m512 b, __m512 c, const int rounding)

Synopsis

__m512 _mm512_fmsubadd_round_ps (__m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm {er}
             vfmsubadd213ps zmm {k}, zmm, zmm {er}
             vfmsubadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_mask_fmsubadd_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, const int rounding)

Synopsis

__m512 _mm512_mask_fmsubadd_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm {er}
             vfmsubadd213ps zmm {k}, zmm, zmm {er}
             vfmsubadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_mask3_fmsubadd_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, const int rounding)

Synopsis

__m512 _mm512_mask3_fmsubadd_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm {er}
             vfmsubadd213ps zmm {k}, zmm, zmm {er}
             vfmsubadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_maskz_fmsubadd_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)

Synopsis

__m512 _mm512_maskz_fmsubadd_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm {er}
             vfmsubadd213ps zmm {k}, zmm, zmm {er}
             vfmsubadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m128d _mm_fnmadd_pd (__m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_fnmadd_pd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmadd132pd xmm, xmm, xmm
             vfnmadd213pd xmm, xmm, xmm
             vfnmadd231pd xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m128d _mm_mask_fnmadd_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)

Synopsis

__m128d _mm_mask_fnmadd_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmadd132pd
             vfnmadd213pd
             vfnmadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m128d _mm_mask3_fnmadd_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)

Synopsis

__m128d _mm_mask3_fnmadd_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmadd132pd
             vfnmadd213pd
             vfnmadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:128] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m128d _mm_maskz_fnmadd_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_maskz_fnmadd_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmadd132pd
             vfnmadd213pd
             vfnmadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m256d _mm256_fnmadd_pd (__m256d a, __m256d b, __m256d c)

Synopsis

__m256d _mm256_fnmadd_pd (__m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfnmadd132pd ymm, ymm, ymm
             vfnmadd213pd ymm, ymm, ymm
             vfnmadd231pd ymm, ymm, ymm
CPUID Flags: FMA

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m256d _mm256_mask_fnmadd_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)

Synopsis

__m256d _mm256_mask_fnmadd_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfnmadd132pd
             vfnmadd213pd
             vfnmadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m256d _mm256_mask3_fnmadd_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)

Synopsis

__m256d _mm256_mask3_fnmadd_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmadd132pd
             vfnmadd213pd
             vfnmadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:256] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m256d _mm256_maskz_fnmadd_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)

Synopsis

__m256d _mm256_maskz_fnmadd_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfnmadd132pd
             vfnmadd213pd
             vfnmadd231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_fnmadd_pd (__m512d a, __m512d b, __m512d c)

Synopsis

__m512d _mm512_fnmadd_pd (__m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm
             vfnmadd213pd zmm {k}, zmm, zmm
             vfnmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ENDFOR dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_mask_fnmadd_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)

Synopsis

__m512d _mm512_mask_fnmadd_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm
             vfnmadd213pd zmm {k}, zmm, zmm
             vfnmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_mask3_fnmadd_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)

Synopsis

__m512d _mm512_mask3_fnmadd_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm
             vfnmadd213pd zmm {k}, zmm, zmm
             vfnmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_maskz_fnmadd_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)

Synopsis

__m512d _mm512_maskz_fnmadd_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm
             vfnmadd213pd zmm {k}, zmm, zmm
             vfnmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m128 _mm_fnmadd_ps (__m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_fnmadd_ps (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmadd132ps xmm, xmm, xmm
             vfnmadd213ps xmm, xmm, xmm
             vfnmadd231ps xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m128 _mm_mask_fnmadd_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)

Synopsis

__m128 _mm_mask_fnmadd_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmadd132ps
             vfnmadd213ps
             vfnmadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m128 _mm_mask3_fnmadd_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)

Synopsis

__m128 _mm_mask3_fnmadd_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmadd132ps
             vfnmadd213ps
             vfnmadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:128] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m128 _mm_maskz_fnmadd_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_maskz_fnmadd_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmadd132ps
             vfnmadd213ps
             vfnmadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m256 _mm256_fnmadd_ps (__m256 a, __m256 b, __m256 c)

Synopsis

__m256 _mm256_fnmadd_ps (__m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfnmadd132ps ymm, ymm, ymm
             vfnmadd213ps ymm, ymm, ymm
             vfnmadd231ps ymm, ymm, ymm
CPUID Flags: FMA

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m256 _mm256_mask_fnmadd_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)

Synopsis

__m256 _mm256_mask_fnmadd_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfnmadd132ps
             vfnmadd213ps
             vfnmadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m256 _mm256_mask3_fnmadd_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)

Synopsis

__m256 _mm256_mask3_fnmadd_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmadd132ps
             vfnmadd213ps
             vfnmadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:256] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m256 _mm256_maskz_fnmadd_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)

Synopsis

__m256 _mm256_maskz_fnmadd_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfnmadd132ps
             vfnmadd213ps
             vfnmadd231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_fnmadd_ps (__m512 a, __m512 b, __m512 c)

Synopsis

__m512 _mm512_fnmadd_ps (__m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm
             vfnmadd213ps zmm {k}, zmm, zmm
             vfnmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ENDFOR dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_mask_fnmadd_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)

Synopsis

__m512 _mm512_mask_fnmadd_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm
             vfnmadd213ps zmm {k}, zmm, zmm
             vfnmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_mask3_fnmadd_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)

Synopsis

__m512 _mm512_mask3_fnmadd_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm
             vfnmadd213ps zmm {k}, zmm, zmm
             vfnmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_maskz_fnmadd_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)

Synopsis

__m512 _mm512_maskz_fnmadd_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm
             vfnmadd213ps zmm {k}, zmm, zmm
             vfnmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_fnmadd_round_pd (__m512d a, __m512d b, __m512d c, int rounding)

Synopsis

__m512d _mm512_fnmadd_round_pd (__m512d a, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm {er}
             vfnmadd213pd zmm {k}, zmm, zmm {er}
             vfnmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ENDFOR dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_mask_fnmadd_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)

Synopsis

__m512d _mm512_mask_fnmadd_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm {er}
             vfnmadd213pd zmm {k}, zmm, zmm {er}
             vfnmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_mask3_fnmadd_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)

Synopsis

__m512d _mm512_mask3_fnmadd_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm {er}
             vfnmadd213pd zmm {k}, zmm, zmm {er}
             vfnmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_maskz_fnmadd_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)

Synopsis

__m512d _mm512_maskz_fnmadd_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm {er}
             vfnmadd213pd zmm {k}, zmm, zmm {er}
             vfnmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_fnmadd_round_ps (__m512 a, __m512 b, __m512 c, int rounding)

Synopsis

__m512 _mm512_fnmadd_round_ps (__m512 a, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm {er}
             vfnmadd213ps zmm {k}, zmm, zmm {er}
             vfnmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ENDFOR dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_mask_fnmadd_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)

Synopsis

__m512 _mm512_mask_fnmadd_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm {er}
             vfnmadd213ps zmm {k}, zmm, zmm {er}
             vfnmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_mask3_fnmadd_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)

Synopsis

__m512 _mm512_mask3_fnmadd_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm {er}
             vfnmadd213ps zmm {k}, zmm, zmm {er}
             vfnmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_maskz_fnmadd_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)

Synopsis

__m512 _mm512_maskz_fnmadd_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm {er}
             vfnmadd213ps zmm {k}, zmm, zmm {er}
             vfnmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfnmadd132sd, vfnmadd213sd, vfnmadd231sd
__m128d _mm_mask_fnmadd_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)

Synopsis

__m128d _mm_mask_fnmadd_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132sd xmm {k}, xmm, xmm {er}
             vfnmadd213sd xmm {k}, xmm, xmm {er}
             vfnmadd231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfnmadd132sd, vfnmadd213sd, vfnmadd231sd
__m128d _mm_mask3_fnmadd_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)

Synopsis

__m128d _mm_mask3_fnmadd_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132sd xmm {k}, xmm, xmm {er}
             vfnmadd213sd xmm {k}, xmm, xmm {er}
             vfnmadd231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := c[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfnmadd132sd, vfnmadd213sd, vfnmadd231sd
__m128d _mm_maskz_fnmadd_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)

Synopsis

__m128d _mm_maskz_fnmadd_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132sd xmm {k}, xmm, xmm {er}
             vfnmadd213sd xmm {k}, xmm, xmm {er}
             vfnmadd231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfnmadd132ss, vfnmadd213ss, vfnmadd231ss
__m128 _mm_mask_fnmadd_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)

Synopsis

__m128 _mm_mask_fnmadd_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132ss xmm {k}, xmm, xmm {er}
             vfnmadd213ss xmm {k}, xmm, xmm {er}
             vfnmadd231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfnmadd132ss, vfnmadd213ss, vfnmadd231ss
__m128 _mm_mask3_fnmadd_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)

Synopsis

__m128 _mm_mask3_fnmadd_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132ss xmm {k}, xmm, xmm {er}
             vfnmadd213ss xmm {k}, xmm, xmm {er}
             vfnmadd231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := c[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfnmadd132ss, vfnmadd213ss, vfnmadd231ss
__m128 _mm_maskz_fnmadd_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)

Synopsis

__m128 _mm_maskz_fnmadd_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132ss xmm {k}, xmm, xmm {er}
             vfnmadd213ss xmm {k}, xmm, xmm {er}
             vfnmadd231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfnmadd132sd, vfnmadd213sd, vfnmadd231sd
__m128d _mm_fnmadd_sd (__m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_fnmadd_sd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmadd132sd xmm, xmm, xmm
             vfnmadd213sd xmm, xmm, xmm
             vfnmadd231sd xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] dst[127:64] := a[127:64] dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfnmadd132sd, vfnmadd213sd, vfnmadd231sd
__m128d _mm_mask_fnmadd_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)

Synopsis

__m128d _mm_mask_fnmadd_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmadd132sd xmm {k}, xmm, xmm
             vfnmadd213sd xmm {k}, xmm, xmm
             vfnmadd231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfnmadd132sd, vfnmadd213sd, vfnmadd231sd
__m128d _mm_mask3_fnmadd_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)

Synopsis

__m128d _mm_mask3_fnmadd_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmadd132sd xmm {k}, xmm, xmm
             vfnmadd213sd xmm {k}, xmm, xmm
             vfnmadd231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := c[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfnmadd132sd, vfnmadd213sd, vfnmadd231sd
__m128d _mm_maskz_fnmadd_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_maskz_fnmadd_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmadd132sd xmm {k}, xmm, xmm
             vfnmadd213sd xmm {k}, xmm, xmm
             vfnmadd231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfnmadd132ss, vfnmadd213ss, vfnmadd231ss
__m128 _mm_fnmadd_ss (__m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_fnmadd_ss (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmadd132ss xmm, xmm, xmm
             vfnmadd213ss xmm, xmm, xmm
             vfnmadd231ss xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] dst[127:32] := a[127:32] dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfnmadd132ss, vfnmadd213ss, vfnmadd231ss
__m128 _mm_mask_fnmadd_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)

Synopsis

__m128 _mm_mask_fnmadd_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmadd132ss xmm {k}, xmm, xmm
             vfnmadd213ss xmm {k}, xmm, xmm
             vfnmadd231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfnmadd132ss, vfnmadd213ss, vfnmadd231ss
__m128 _mm_mask3_fnmadd_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)

Synopsis

__m128 _mm_mask3_fnmadd_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmadd132ss xmm {k}, xmm, xmm
             vfnmadd213ss xmm {k}, xmm, xmm
             vfnmadd231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := c[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfnmadd132ss, vfnmadd213ss, vfnmadd231ss
__m128 _mm_maskz_fnmadd_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_maskz_fnmadd_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmadd132ss xmm {k}, xmm, xmm
             vfnmadd213ss xmm {k}, xmm, xmm
             vfnmadd231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m128d _mm_fnmsub_pd (__m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_fnmsub_pd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmsub132pd xmm, xmm, xmm
             vfnmsub213pd xmm, xmm, xmm
             vfnmsub231pd xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m128d _mm_mask_fnmsub_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)

Synopsis

__m128d _mm_mask_fnmsub_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmsub132pd
             vfnmsub213pd
             vfnmsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m128d _mm_mask3_fnmsub_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)

Synopsis

__m128d _mm_mask3_fnmsub_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmsub132pd
             vfnmsub213pd
             vfnmsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:128] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m128d _mm_maskz_fnmsub_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_maskz_fnmsub_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmsub132pd
             vfnmsub213pd
             vfnmsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m256d _mm256_fnmsub_pd (__m256d a, __m256d b, __m256d c)

Synopsis

__m256d _mm256_fnmsub_pd (__m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfnmsub132pd ymm, ymm, ymm
             vfnmsub213pd ymm, ymm, ymm
             vfnmsub231pd ymm, ymm, ymm
CPUID Flags: FMA

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m256d _mm256_mask_fnmsub_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)

Synopsis

__m256d _mm256_mask_fnmsub_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfnmsub132pd
             vfnmsub213pd
             vfnmsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m256d _mm256_mask3_fnmsub_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)

Synopsis

__m256d _mm256_mask3_fnmsub_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmsub132pd
             vfnmsub213pd
             vfnmsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:256] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m256d _mm256_maskz_fnmsub_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)

Synopsis

__m256d _mm256_maskz_fnmsub_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfnmsub132pd
             vfnmsub213pd
             vfnmsub231pd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_fnmsub_pd (__m512d a, __m512d b, __m512d c)

Synopsis

__m512d _mm512_fnmsub_pd (__m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm
             vfnmsub213pd zmm {k}, zmm, zmm
             vfnmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ENDFOR dst[MAX:512] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_mask_fnmsub_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)

Synopsis

__m512d _mm512_mask_fnmsub_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm
             vfnmsub213pd zmm {k}, zmm, zmm
             vfnmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_mask3_fnmsub_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)

Synopsis

__m512d _mm512_mask3_fnmsub_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm
             vfnmsub213pd zmm {k}, zmm, zmm
             vfnmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_maskz_fnmsub_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)

Synopsis

__m512d _mm512_maskz_fnmsub_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm
             vfnmsub213pd zmm {k}, zmm, zmm
             vfnmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m128 _mm_fnmsub_ps (__m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_fnmsub_ps (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmsub132ps xmm, xmm, xmm
             vfnmsub213ps xmm, xmm, xmm
             vfnmsub231ps xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m128 _mm_mask_fnmsub_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)

Synopsis

__m128 _mm_mask_fnmsub_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmsub132ps
             vfnmsub213ps
             vfnmsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m128 _mm_mask3_fnmsub_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)

Synopsis

__m128 _mm_mask3_fnmsub_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmsub132ps
             vfnmsub213ps
             vfnmsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:128] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m128 _mm_maskz_fnmsub_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_maskz_fnmsub_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmsub132ps
             vfnmsub213ps
             vfnmsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m256 _mm256_fnmsub_ps (__m256 a, __m256 b, __m256 c)

Synopsis

__m256 _mm256_fnmsub_ps (__m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfnmsub132ps ymm, ymm, ymm
             vfnmsub213ps ymm, ymm, ymm
             vfnmsub231ps ymm, ymm, ymm
CPUID Flags: FMA

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m256 _mm256_mask_fnmsub_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)

Synopsis

__m256 _mm256_mask_fnmsub_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfnmsub132ps
             vfnmsub213ps
             vfnmsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m256 _mm256_mask3_fnmsub_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)

Synopsis

__m256 _mm256_mask3_fnmsub_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmsub132ps
             vfnmsub213ps
             vfnmsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:256] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m256 _mm256_maskz_fnmsub_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)

Synopsis

__m256 _mm256_maskz_fnmsub_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfnmsub132ps
             vfnmsub213ps
             vfnmsub231ps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_fnmsub_ps (__m512 a, __m512 b, __m512 c)

Synopsis

__m512 _mm512_fnmsub_ps (__m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm
             vfnmsub213ps zmm {k}, zmm, zmm
             vfnmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ENDFOR dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_mask_fnmsub_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)

Synopsis

__m512 _mm512_mask_fnmsub_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm
             vfnmsub213ps zmm {k}, zmm, zmm
             vfnmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_mask3_fnmsub_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)

Synopsis

__m512 _mm512_mask3_fnmsub_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm
             vfnmsub213ps zmm {k}, zmm, zmm
             vfnmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_maskz_fnmsub_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)

Synopsis

__m512 _mm512_maskz_fnmsub_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm
             vfnmsub213ps zmm {k}, zmm, zmm
             vfnmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_fnmsub_round_pd (__m512d a, __m512d b, __m512d c, int rounding)

Synopsis

__m512d _mm512_fnmsub_round_pd (__m512d a, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm {er}
             vfnmsub213pd zmm {k}, zmm, zmm {er}
             vfnmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ENDFOR dst[MAX:512] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_mask_fnmsub_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)

Synopsis

__m512d _mm512_mask_fnmsub_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm {er}
             vfnmsub213pd zmm {k}, zmm, zmm {er}
             vfnmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_mask3_fnmsub_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)

Synopsis

__m512d _mm512_mask3_fnmsub_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm {er}
             vfnmsub213pd zmm {k}, zmm, zmm {er}
             vfnmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_maskz_fnmsub_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)

Synopsis

__m512d _mm512_maskz_fnmsub_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm {er}
             vfnmsub213pd zmm {k}, zmm, zmm {er}
             vfnmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_fnmsub_round_ps (__m512 a, __m512 b, __m512 c, int rounding)

Synopsis

__m512 _mm512_fnmsub_round_ps (__m512 a, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm {er}
             vfnmsub213ps zmm {k}, zmm, zmm {er}
             vfnmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ENDFOR dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_mask_fnmsub_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)

Synopsis

__m512 _mm512_mask_fnmsub_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm {er}
             vfnmsub213ps zmm {k}, zmm, zmm {er}
             vfnmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_mask3_fnmsub_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)

Synopsis

__m512 _mm512_mask3_fnmsub_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm {er}
             vfnmsub213ps zmm {k}, zmm, zmm {er}
             vfnmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_maskz_fnmsub_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)

Synopsis

__m512 _mm512_maskz_fnmsub_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm {er}
             vfnmsub213ps zmm {k}, zmm, zmm {er}
             vfnmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vfnmsub132sd, vfnmsub213sd, vfnmsub231sd
__m128d _mm_mask_fnmsub_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)

Synopsis

__m128d _mm_mask_fnmsub_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132sd xmm {k}, xmm, xmm {er}
             vfnmsub213sd xmm {k}, xmm, xmm {er}
             vfnmsub231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfnmsub132sd, vfnmsub213sd, vfnmsub231sd
__m128d _mm_mask3_fnmsub_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)

Synopsis

__m128d _mm_mask3_fnmsub_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132sd xmm {k}, xmm, xmm {er}
             vfnmsub213sd xmm {k}, xmm, xmm {er}
             vfnmsub231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := c[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfnmsub132sd, vfnmsub213sd, vfnmsub231sd
__m128d _mm_maskz_fnmsub_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)

Synopsis

__m128d _mm_maskz_fnmsub_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132sd xmm {k}, xmm, xmm {er}
             vfnmsub213sd xmm {k}, xmm, xmm {er}
             vfnmsub231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfnmsub132ss, vfnmsub213ss, vfnmsub231ss
__m128 _mm_mask_fnmsub_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)

Synopsis

__m128 _mm_mask_fnmsub_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132ss xmm {k}, xmm, xmm {er}
             vfnmsub213ss xmm {k}, xmm, xmm {er}
             vfnmsub231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfnmsub132ss, vfnmsub213ss, vfnmsub231ss
__m128 _mm_mask3_fnmsub_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)

Synopsis

__m128 _mm_mask3_fnmsub_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132ss xmm {k}, xmm, xmm {er}
             vfnmsub213ss xmm {k}, xmm, xmm {er}
             vfnmsub231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := c[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfnmsub132ss, vfnmsub213ss, vfnmsub231ss
__m128 _mm_maskz_fnmsub_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)

Synopsis

__m128 _mm_maskz_fnmsub_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132ss xmm {k}, xmm, xmm {er}
             vfnmsub213ss xmm {k}, xmm, xmm {er}
             vfnmsub231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfnmsub132sd, vfnmsub213sd, vfnmsub231sd
__m128d _mm_fnmsub_sd (__m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_fnmsub_sd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmsub132sd xmm, xmm, xmm
             vfnmsub213sd xmm, xmm, xmm
             vfnmsub231sd xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] dst[127:64] := a[127:64] dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfnmsub132sd, vfnmsub213sd, vfnmsub231sd
__m128d _mm_mask_fnmsub_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)

Synopsis

__m128d _mm_mask_fnmsub_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmsub132sd xmm {k}, xmm, xmm
             vfnmsub213sd xmm {k}, xmm, xmm
             vfnmsub231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfnmsub132sd, vfnmsub213sd, vfnmsub231sd
__m128d _mm_mask3_fnmsub_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)

Synopsis

__m128d _mm_mask3_fnmsub_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmsub132sd xmm {k}, xmm, xmm
             vfnmsub213sd xmm {k}, xmm, xmm
             vfnmsub231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := c[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfnmsub132sd, vfnmsub213sd, vfnmsub231sd
__m128d _mm_maskz_fnmsub_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)

Synopsis

__m128d _mm_maskz_fnmsub_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmsub132sd xmm {k}, xmm, xmm
             vfnmsub213sd xmm {k}, xmm, xmm
             vfnmsub231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vfnmsub132ss, vfnmsub213ss, vfnmsub231ss
__m128 _mm_fnmsub_ss (__m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_fnmsub_ss (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmsub132ss xmm, xmm, xmm
             vfnmsub213ss xmm, xmm, xmm
             vfnmsub231ss xmm, xmm, xmm
CPUID Flags: FMA

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] dst[127:32] := a[127:32] dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
vfnmsub132ss, vfnmsub213ss, vfnmsub231ss
__m128 _mm_mask_fnmsub_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)

Synopsis

__m128 _mm_mask_fnmsub_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmsub132ss xmm {k}, xmm, xmm
             vfnmsub213ss xmm {k}, xmm, xmm
             vfnmsub231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfnmsub132ss, vfnmsub213ss, vfnmsub231ss
__m128 _mm_mask3_fnmsub_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)

Synopsis

__m128 _mm_mask3_fnmsub_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmsub132ss xmm {k}, xmm, xmm
             vfnmsub213ss xmm {k}, xmm, xmm
             vfnmsub231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := c[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfnmsub132ss, vfnmsub213ss, vfnmsub231ss
__m128 _mm_maskz_fnmsub_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)

Synopsis

__m128 _mm_maskz_fnmsub_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmsub132ss xmm {k}, xmm, xmm
             vfnmsub213ss xmm {k}, xmm, xmm
             vfnmsub231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vfpclasspd
__mmask8 _mm_fpclass_pd_mask (__m128d a, int imm8)

Synopsis

__mmask8 _mm_fpclass_pd_mask (__m128d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasspd
CPUID Flags: AVX512VL + AVX512DQ

Description

Test packed double-precision (64-bit) floating-point elements in a for special categories specified by imm8, and store the results in mask vector k.
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

FOR j := 0 to 1 i := j*64 k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) ENDFOR k[MAX:2] := 0
vfpclasspd
__mmask8 _mm_mask_fpclass_pd_mask (__mmask8 k1, __m128d a, int imm8)

Synopsis

__mmask8 _mm_mask_fpclass_pd_mask (__mmask8 k1, __m128d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasspd
CPUID Flags: AVX512VL + AVX512DQ

Description

Test packed double-precision (64-bit) floating-point elements in a for special categories specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vfpclasspd
__mmask8 _mm256_fpclass_pd_mask (__m256d a, int imm8)

Synopsis

__mmask8 _mm256_fpclass_pd_mask (__m256d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasspd
CPUID Flags: AVX512VL + AVX512DQ

Description

Test packed double-precision (64-bit) floating-point elements in a for special categories specified by imm8, and store the results in mask vector k.
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

FOR j := 0 to 3 i := j*64 k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) ENDFOR k[MAX:4] := 0
vfpclasspd
__mmask8 _mm256_mask_fpclass_pd_mask (__mmask8 k1, __m256d a, int imm8)

Synopsis

__mmask8 _mm256_mask_fpclass_pd_mask (__mmask8 k1, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasspd
CPUID Flags: AVX512VL + AVX512DQ

Description

Test packed double-precision (64-bit) floating-point elements in a for special categories specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vfpclasspd
__mmask8 _mm512_fpclass_pd_mask (__m512d a, int imm8)

Synopsis

__mmask8 _mm512_fpclass_pd_mask (__m512d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasspd
CPUID Flags: AVX512DQ

Description

Test packed double-precision (64-bit) floating-point elements in a for special categories specified by imm8, and store the results in mask vector k.
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

FOR j := 0 to 7 i := j*64 k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) ENDFOR k[MAX:8] := 0
vfpclasspd
__mmask8 _mm512_mask_fpclass_pd_mask (__mmask8 k1, __m512d a, int imm8)

Synopsis

__mmask8 _mm512_mask_fpclass_pd_mask (__mmask8 k1, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasspd
CPUID Flags: AVX512DQ

Description

Test packed double-precision (64-bit) floating-point elements in a for special categories specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vfpclassps
__mmask8 _mm_fpclass_ps_mask (__m128 a, int imm8)

Synopsis

__mmask8 _mm_fpclass_ps_mask (__m128 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassps
CPUID Flags: AVX512VL + AVX512DQ

Description

Test packed single-precision (32-bit) floating-point elements in a for special categories specified by imm8, and store the results in mask vector k.
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

FOR j := 0 to 3 i := j*32 k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) ENDFOR k[MAX:4] := 0
vfpclassps
__mmask8 _mm_mask_fpclass_ps_mask (__mmask8 k1, __m128 a, int imm8)

Synopsis

__mmask8 _mm_mask_fpclass_ps_mask (__mmask8 k1, __m128 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassps
CPUID Flags: AVX512VL + AVX512DQ

Description

Test packed single-precision (32-bit) floating-point elements in a for special categories specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vfpclassps
__mmask8 _mm256_fpclass_ps_mask (__m256 a, int imm8)

Synopsis

__mmask8 _mm256_fpclass_ps_mask (__m256 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassps
CPUID Flags: AVX512VL + AVX512DQ

Description

Test packed single-precision (32-bit) floating-point elements in a for special categories specified by imm8, and store the results in mask vector k.
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

FOR j := 0 to 7 i := j*32 k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) ENDFOR k[MAX:8] := 0
vfpclassps
__mmask8 _mm256_mask_fpclass_ps_mask (__mmask8 k1, __m256 a, int imm8)

Synopsis

__mmask8 _mm256_mask_fpclass_ps_mask (__mmask8 k1, __m256 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassps
CPUID Flags: AVX512VL + AVX512DQ

Description

Test packed single-precision (32-bit) floating-point elements in a for special categories specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vfpclassps
__mmask16 _mm512_fpclass_ps_mask (__m512 a, int imm8)

Synopsis

__mmask16 _mm512_fpclass_ps_mask (__m512 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassps
CPUID Flags: AVX512DQ

Description

Test packed single-precision (32-bit) floating-point elements in a for special categories specified by imm8, and store the results in mask vector k.
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

FOR j := 0 to 15 i := j*32 k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) ENDFOR k[MAX:16] := 0
vfpclassps
__mmask16 _mm512_mask_fpclass_ps_mask (__mmask16 k1, __m512 a, int imm8)

Synopsis

__mmask16 _mm512_mask_fpclass_ps_mask (__mmask16 k1, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassps
CPUID Flags: AVX512DQ

Description

Test packed single-precision (32-bit) floating-point elements in a for special categories specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vfpclasssd
__mmask8 _mm_fpclass_sd_mask (__m128d a, int imm8)

Synopsis

__mmask8 _mm_fpclass_sd_mask (__m128d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasssd
CPUID Flags: AVX512DQ

Description

Test the lower double-precision (64-bit) floating-point element in a for special categories specified by imm8, and store the result in mask vector k.
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0]) k[MAX:1] := 0
vfpclasssd
__mmask8 _mm_mask_fpclass_sd_mask (__mmask8 k1, __m128d a, int imm8)

Synopsis

__mmask8 _mm_mask_fpclass_sd_mask (__mmask8 k1, __m128d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasssd
CPUID Flags: AVX512DQ

Description

Test the lower double-precision (64-bit) floating-point element in a for special categories specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

IF k1[0] k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0]) ELSE k[0] := 0 FI k[MAX:1] := 0
vfpclassss
__mmask8 _mm_fpclass_ss_mask (__m128 a, int imm8)

Synopsis

__mmask8 _mm_fpclass_ss_mask (__m128 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassss
CPUID Flags: AVX512DQ

Description

Test the lower single-precision (32-bit) floating-point element in a for special categories specified by imm8, and store the result in mask vector k.
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0]) k[MAX:1] := 0
vfpclassss
__mmask8 _mm_mask_fpclass_ss_mask (__mmask8 k1, __m128 a, int imm8)

Synopsis

__mmask8 _mm_mask_fpclass_ss_mask (__mmask8 k1, __m128 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassss
CPUID Flags: AVX512DQ

Description

Test the lower single-precision (32-bit) floating-point element in a for special categories specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).
imm can be a combination of:
0x01 // QNaN 0x02 // Positive Zero 0x04 // Negative Zero 0x08 // Positive Infinity 0x10 // Negative Infinity 0x20 // Denormal 0x40 // Negative 0x80 // SNaN

Operation

IF k1[0] k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0]) ELSE k[0] := 0 FI k[MAX:1] := 0
void _mm_free (void * mem_addr)

Synopsis

void _mm_free (void * mem_addr)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Free aligned memory that was allocated with _mm_malloc.
fxrstor
void _fxrstor (void * mem_addr)

Synopsis

void _fxrstor (void * mem_addr)
#include "immintrin.h"
Instruction: fxrstor MEMmfpxenv
CPUID Flags: FXSR

Description

Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at mem_addr. This data should have been written to memory previously using the FXSAVE instruction, and in the same format as required by the operating mode. mem_addr must be aligned on a 16-byte boundary.

Operation

(x87 FPU, MMX, XMM7-XMM0, MXCSR) := Load(MEM[mem_addr])
fxrstor64
void _fxrstor64 (void * mem_addr)

Synopsis

void _fxrstor64 (void * mem_addr)
#include "immintrin.h"
Instruction: fxrstor64 MEMmfpxenv
CPUID Flags: FXSR

Description

Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at mem_addr. This data should have been written to memory previously using the FXSAVE64 instruction, and in the same format as required by the operating mode. mem_addr must be aligned on a 16-byte boundary.

Operation

(x87 FPU, MMX, XMM7-XMM0, MXCSR) := Load(MEM[mem_addr])
fxsave
void _fxsave (void * mem_addr)

Synopsis

void _fxsave (void * mem_addr)
#include "immintrin.h"
Instruction: fxsave MEMmfpxenv
CPUID Flags: FXSR

Description

Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at mem_addr. The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor.

Operation

MEM[mem_addr+511*8:mem_addr] := Fxsave(x87 FPU, MMX, XMM7-XMM0, MXCSR)
fxsave64
void _fxsave64 (void * mem_addr)

Synopsis

void _fxsave64 (void * mem_addr)
#include "immintrin.h"
Instruction: fxsave64 MEMmfpxenv
CPUID Flags: FXSR

Description

Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at mem_addr. The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor.

Operation

MEM[mem_addr+511*8:mem_addr] := Fxsave64(x87 FPU, MMX, XMM7-XMM0, MXCSR)
unsigned int _MM_GET_EXCEPTION_MASK ()

Synopsis

unsigned int _MM_GET_EXCEPTION_MASK ()
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Macro: Get the exception mask bits from the MXCSR control and status register. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT

Operation

dst[31:0] := MXCSR & _MM_MASK_MASK
unsigned int _MM_GET_EXCEPTION_STATE ()

Synopsis

unsigned int _MM_GET_EXCEPTION_STATE ()
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Macro: Get the exception state bits from the MXCSR control and status register. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT

Operation

dst[31:0] := MXCSR & _MM_EXCEPT_MASK
unsigned int _MM_GET_FLUSH_ZERO_MODE ()

Synopsis

unsigned int _MM_GET_FLUSH_ZERO_MODE ()
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Macro: Get the flush zero bits from the MXCSR control and status register. The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF

Operation

dst[31:0] := MXCSR & _MM_FLUSH_MASK
unsigned int _MM_GET_ROUNDING_MODE ()

Synopsis

unsigned int _MM_GET_ROUNDING_MODE ()
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Macro: Get the rounding mode bits from the MXCSR control and status register. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO

Operation

dst[31:0] := MXCSR & _MM_ROUND_MASK
stmxcsr
unsigned int _mm_getcsr (void)

Synopsis

unsigned int _mm_getcsr (void)
#include "xmmintrin.h"
Instruction: stmxcsr MEMd
CPUID Flags: SSE

Description

Get the unsigned 32-bit value of the MXCSR control and status register.

Operation

dst[31:0] := MXCSR

Performance

ArchitectureLatencyThroughput
Haswell4-
Ivy Bridge2-
Sandy Bridge2-
Westmere1-
Nehalem1-
vgetexppd
__m128d _mm_getexp_pd (__m128d a)

Synopsis

__m128d _mm_getexp_pd (__m128d a)
#include "immintrin.h"
Instruction: vgetexppd
CPUID Flags: AVX512VL + AVX512F

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ENDFOR dst[MAX:128] := 0
vgetexppd
__m128d _mm_mask_getexp_pd (__m128d src, __mmask8 k, __m128d a)

Synopsis

__m128d _mm_mask_getexp_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vgetexppd
CPUID Flags: AVX512VL + AVX512F

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vgetexppd
__m128d _mm_maskz_getexp_pd (__mmask8 k, __m128d a)

Synopsis

__m128d _mm_maskz_getexp_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vgetexppd
CPUID Flags: AVX512VL + AVX512F

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vgetexppd
__m256d _mm256_getexp_pd (__m256d a)

Synopsis

__m256d _mm256_getexp_pd (__m256d a)
#include "immintrin.h"
Instruction: vgetexppd
CPUID Flags: AVX512VL + AVX512F

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ENDFOR dst[MAX:256] := 0
vgetexppd
__m256d _mm256_mask_getexp_pd (__m256d src, __mmask8 k, __m256d a)

Synopsis

__m256d _mm256_mask_getexp_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vgetexppd
CPUID Flags: AVX512VL + AVX512F

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vgetexppd
__m256d _mm256_maskz_getexp_pd (__mmask8 k, __m256d a)

Synopsis

__m256d _mm256_maskz_getexp_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vgetexppd
CPUID Flags: AVX512VL + AVX512F

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vgetexppd
__m512d _mm512_getexp_pd (__m512d a)

Synopsis

__m512d _mm512_getexp_pd (__m512d a)
#include "immintrin.h"
Instruction: vgetexppd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vgetexppd
__m512d _mm512_mask_getexp_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_getexp_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vgetexppd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vgetexppd
__m512d _mm512_maskz_getexp_pd (__mmask8 k, __m512d a)

Synopsis

__m512d _mm512_maskz_getexp_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vgetexppd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vgetexpps
__m128 _mm_getexp_ps (__m128 a)

Synopsis

__m128 _mm_getexp_ps (__m128 a)
#include "immintrin.h"
Instruction: vgetexpps
CPUID Flags: AVX512VL + AVX512F

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ENDFOR dst[MAX:128] := 0
vgetexpps
__m128 _mm_mask_getexp_ps (__m128 src, __mmask8 k, __m128 a)

Synopsis

__m128 _mm_mask_getexp_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vgetexpps
CPUID Flags: AVX512VL + AVX512F

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vgetexpps
__m128 _mm_maskz_getexp_ps (__mmask8 k, __m128 a)

Synopsis

__m128 _mm_maskz_getexp_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vgetexpps
CPUID Flags: AVX512VL + AVX512F

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vgetexpps
__m256 _mm256_getexp_ps (__m256 a)

Synopsis

__m256 _mm256_getexp_ps (__m256 a)
#include "immintrin.h"
Instruction: vgetexpps
CPUID Flags: AVX512VL + AVX512F

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ENDFOR dst[MAX:256] := 0
vgetexpps
__m256 _mm256_mask_getexp_ps (__m256 src, __mmask8 k, __m256 a)

Synopsis

__m256 _mm256_mask_getexp_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vgetexpps
CPUID Flags: AVX512VL + AVX512F

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vgetexpps
__m256 _mm256_maskz_getexp_ps (__mmask8 k, __m256 a)

Synopsis

__m256 _mm256_maskz_getexp_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vgetexpps
CPUID Flags: AVX512VL + AVX512F

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vgetexpps
__m512 _mm512_getexp_ps (__m512 a)

Synopsis

__m512 _mm512_getexp_ps (__m512 a)
#include "immintrin.h"
Instruction: vgetexpps zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vgetexpps
__m512 _mm512_mask_getexp_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_getexp_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vgetexpps zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vgetexpps
__m512 _mm512_maskz_getexp_ps (__mmask16 k, __m512 a)

Synopsis

__m512 _mm512_maskz_getexp_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vgetexpps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vgetexppd
__m512d _mm512_getexp_round_pd (__m512d a, int rounding)

Synopsis

__m512d _mm512_getexp_round_pd (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vgetexppd zmm {k}, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vgetexppd
__m512d _mm512_mask_getexp_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)

Synopsis

__m512d _mm512_mask_getexp_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vgetexppd zmm {k}, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vgetexppd
__m512d _mm512_maskz_getexp_round_pd (__mmask8 k, __m512d a, int rounding)

Synopsis

__m512d _mm512_maskz_getexp_round_pd (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vgetexppd zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vgetexpps
__m512 _mm512_getexp_round_ps (__m512 a, int rounding)

Synopsis

__m512 _mm512_getexp_round_ps (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vgetexpps zmm {k}, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vgetexpps
__m512 _mm512_mask_getexp_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)

Synopsis

__m512 _mm512_mask_getexp_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vgetexpps zmm {k}, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := ConvertExpFP32(a[i+31:i])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
vgetexpps
__m512 _mm512_maskz_getexp_round_ps (__mmask16 k, __m512 a, int rounding)

Synopsis

__m512 _mm512_maskz_getexp_round_ps (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vgetexpps zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := ConvertExpFP32(a[i+31:i])
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
vgetexpsd
__m128d _mm_getexp_round_sd (__m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_getexp_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vgetexpsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := ConvertExpFP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vgetexpsd
__m128d _mm_mask_getexp_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_mask_getexp_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vgetexpsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0]
    dst[63:0] := ConvertExpFP64(b[63:0])
ELSE
    dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vgetexpsd
__m128d _mm_maskz_getexp_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_maskz_getexp_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vgetexpsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0]
    dst[63:0] := ConvertExpFP64(b[63:0])
ELSE
    dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vgetexpss
__m128 _mm_getexp_round_ss (__m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_getexp_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vgetexpss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := ConvertExpFP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vgetexpss
__m128 _mm_mask_getexp_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_mask_getexp_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vgetexpss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0]
    dst[31:0] := ConvertExpFP32(b[31:0])
ELSE
    dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vgetexpss
__m128 _mm_maskz_getexp_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_maskz_getexp_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vgetexpss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0]
    dst[31:0] := ConvertExpFP32(b[31:0])
ELSE
    dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vgetexpsd
__m128d _mm_getexp_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_getexp_sd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vgetexpsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.

Operation

dst[63:0] := ConvertExpFP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vgetexpsd
__m128d _mm_mask_getexp_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_getexp_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vgetexpsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.

Operation

IF k[0]
    dst[63:0] := ConvertExpFP64(b[63:0])
ELSE
    dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vgetexpsd
__m128d _mm_maskz_getexp_sd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_getexp_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vgetexpsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.

Operation

IF k[0]
    dst[63:0] := ConvertExpFP64(b[63:0])
ELSE
    dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vgetexpss
__m128 _mm_getexp_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_getexp_ss (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vgetexpss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.

Operation

dst[31:0] := ConvertExpFP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vgetexpss
__m128 _mm_mask_getexp_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_getexp_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vgetexpss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.

Operation

IF k[0]
    dst[31:0] := ConvertExpFP32(b[31:0])
ELSE
    dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vgetexpss
__m128 _mm_maskz_getexp_ss (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_getexp_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vgetexpss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.

Operation

IF k[0]
    dst[31:0] := ConvertExpFP32(b[31:0])
ELSE
    dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vgetmantpd
__m128d _mm_getmant_pd (__m128d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m128d _mm_getmant_pd (__m128d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd
CPUID Flags: AVX512VL + AVX512F

Description

Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 1
    i := j*64
    dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
ENDFOR
dst[MAX:128] := 0
vgetmantpd
__m128d _mm_mask_getmant_pd (__m128d src, __mmask8 k, __m128d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m128d _mm_mask_getmant_pd (__m128d src, __mmask8 k, __m128d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd
CPUID Flags: AVX512VL + AVX512F

Description

Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
vgetmantpd
__m128d _mm_maskz_getmant_pd (__mmask8 k, __m128d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m128d _mm_maskz_getmant_pd (__mmask8 k, __m128d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd
CPUID Flags: AVX512VL + AVX512F

Description

Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
vgetmantpd
__m256d _mm256_getmant_pd (__m256d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m256d _mm256_getmant_pd (__m256d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd
CPUID Flags: AVX512VL + AVX512F

Description

Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 3
    i := j*64
    dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
ENDFOR
dst[MAX:256] := 0
vgetmantpd
__m256d _mm256_mask_getmant_pd (__m256d src, __mmask8 k, __m256d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m256d _mm256_mask_getmant_pd (__m256d src, __mmask8 k, __m256d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd
CPUID Flags: AVX512VL + AVX512F

Description

Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
vgetmantpd
__m256d _mm256_maskz_getmant_pd (__mmask8 k, __m256d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m256d _mm256_maskz_getmant_pd (__mmask8 k, __m256d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd
CPUID Flags: AVX512VL + AVX512F

Description

Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
vgetmantpd
__m512d _mm512_getmant_pd (__m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m512d _mm512_getmant_pd (__m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
ENDFOR
dst[MAX:512] := 0
vgetmantpd
__m512d _mm512_mask_getmant_pd (__m512d src, __mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m512d _mm512_mask_getmant_pd (__m512d src, __mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
vgetmantpd
__m512d _mm512_maskz_getmant_pd (__mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m512d _mm512_maskz_getmant_pd (__mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
vgetmantps
__m128 _mm_getmant_ps (__m128 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m128 _mm_getmant_ps (__m128 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps
CPUID Flags: AVX512VL + AVX512F

Description

Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 3
    i := j*32
    dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
ENDFOR
dst[MAX:128] := 0
vgetmantps
__m128 _mm_mask_getmant_ps (__m128 src, __mmask8 k, __m128 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m128 _mm_mask_getmant_ps (__m128 src, __mmask8 k, __m128 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps
CPUID Flags: AVX512VL + AVX512F

Description

Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
vgetmantps
__m128 _mm_maskz_getmant_ps (__mmask8 k, __m128 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m128 _mm_maskz_getmant_ps (__mmask8 k, __m128 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps
CPUID Flags: AVX512VL + AVX512F

Description

Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
vgetmantps
__m256 _mm256_getmant_ps (__m256 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m256 _mm256_getmant_ps (__m256 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps
CPUID Flags: AVX512VL + AVX512F

Description

Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ENDFOR dst[MAX:256] := 0
vgetmantps
__m256 _mm256_mask_getmant_ps (__m256 src, __mmask8 k, __m256 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m256 _mm256_mask_getmant_ps (__m256 src, __mmask8 k, __m256 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps
CPUID Flags: AVX512VL + AVX512F

Description

Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^K)*|x.significand|, where K depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vgetmantps
__m256 _mm256_maskz_getmant_ps (__mmask8 k, __m256 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m256 _mm256_maskz_getmant_ps (__mmask8 k, __m256 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps
CPUID Flags: AVX512VL + AVX512F

Description

Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^K)*|x.significand|, where K depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vgetmantps
__m512 _mm512_getmant_ps (__m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m512 _mm512_getmant_ps (__m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ENDFOR dst[MAX:512] := 0
vgetmantps
__m512 _mm512_mask_getmant_ps (__m512 src, __mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m512 _mm512_mask_getmant_ps (__m512 src, __mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^K)*|x.significand|, where K depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vgetmantps
__m512 _mm512_maskz_getmant_ps (__mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m512 _mm512_maskz_getmant_ps (__mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^K)*|x.significand|, where K depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vgetmantpd
__m512d _mm512_getmant_round_pd (__m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)

Synopsis

__m512d _mm512_getmant_round_pd (__m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantpd zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) ENDFOR dst[MAX:512] := 0
vgetmantpd
__m512d _mm512_mask_getmant_round_pd (__m512d src, __mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)

Synopsis

__m512d _mm512_mask_getmant_round_pd (__m512d src, __mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantpd zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^K)*|x.significand|, where K depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vgetmantpd
__m512d _mm512_maskz_getmant_round_pd (__mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)

Synopsis

__m512d _mm512_maskz_getmant_round_pd (__mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantpd zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F

Description

Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^K)*|x.significand|, where K depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vgetmantps
__m512 _mm512_getmant_round_ps (__m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)

Synopsis

__m512 _mm512_getmant_round_ps (__m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantps zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ENDFOR dst[MAX:512] := 0
vgetmantps
__m512 _mm512_mask_getmant_round_ps (__m512 src, __mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)

Synopsis

__m512 _mm512_mask_getmant_round_ps (__m512 src, __mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantps zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^K)*|x.significand|, where K depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vgetmantps
__m512 _mm512_maskz_getmant_round_ps (__mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)

Synopsis

__m512 _mm512_maskz_getmant_round_ps (__mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantps zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F

Description

Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^K)*|x.significand|, where K depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vgetmantsd
__m128d _mm_getmant_round_sd (__m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)

Synopsis

__m128d _mm_getmant_round_sd (__m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantsd xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F

Description

Normalize the mantissa of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from b to the upper element of dst. This intrinsic essentially calculates ±(2^K)*|x.significand|, where K depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv) dst[127:64] := b[127:64] dst[MAX:128] := 0
vgetmantsd
__m128d _mm_mask_getmant_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)

Synopsis

__m128d _mm_mask_getmant_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantsd xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F

Description

Normalize the mantissa of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from b to the upper element of dst. This intrinsic essentially calculates ±(2^K)*|x.significand|, where K depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv) ELSE dst[63:0] := src[63:0] FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vgetmantsd
__m128d _mm_maskz_getmant_round_sd (__mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)

Synopsis

__m128d _mm_maskz_getmant_round_sd (__mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantsd xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F

Description

Normalize the mantissa of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from b to the upper element of dst. This intrinsic essentially calculates ±(2^K)*|x.significand|, where K depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv) ELSE dst[63:0] := 0 FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vgetmantss
__m128 _mm_getmant_round_ss (__m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)

Synopsis

__m128 _mm_getmant_round_ss (__m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantss xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F

Description

Normalize the mantissa of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from b to the upper elements of dst. This intrinsic essentially calculates ±(2^K)*|x.significand|, where K depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv) dst[127:32] := b[127:32] dst[MAX:128] := 0
vgetmantss
__m128 _mm_mask_getmant_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)

Synopsis

__m128 _mm_mask_getmant_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantss xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F

Description

Normalize the mantissa of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst. This intrinsic essentially calculates ±(2^K)*|x.significand|, where K depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv) ELSE dst[31:0] := src[31:0] FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vgetmantss
__m128 _mm_maskz_getmant_round_ss (__mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)

Synopsis

__m128 _mm_maskz_getmant_round_ss (__mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantss xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F

Description

Normalize the mantissa of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst. This intrinsic essentially calculates ±(2^K)*|x.significand|, where K depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv) ELSE dst[31:0] := 0 FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vgetmantsd
__m128d _mm_getmant_sd (__m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m128d _mm_getmant_sd (__m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantsd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Normalize the mantissa of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from b to the upper element of dst. This intrinsic essentially calculates ±(2^K)*|x.significand|, where K depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv) dst[127:64] := b[127:64] dst[MAX:128] := 0
vgetmantsd
__m128d _mm_mask_getmant_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m128d _mm_mask_getmant_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantsd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Normalize the mantissas of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from b to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

IF k[0] dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv) ELSE dst[63:0] := src[63:0] FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vgetmantsd
__m128d _mm_maskz_getmant_sd (__mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m128d _mm_maskz_getmant_sd (__mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantsd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Normalize the mantissas of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from b to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

IF k[0] dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv) ELSE dst[63:0] := 0 FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vgetmantss
__m128 _mm_getmant_ss (__m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m128 _mm_getmant_ss (__m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantss xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Normalize the mantissas of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from b to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv) dst[127:32] := b[127:32] dst[MAX:128] := 0
vgetmantss
__m128 _mm_mask_getmant_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m128 _mm_mask_getmant_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantss xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Normalize the mantissas of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

IF k[0] dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv) ELSE dst[31:0] := src[31:0] FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vgetmantss
__m128 _mm_maskz_getmant_ss (__mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)

Synopsis

__m128 _mm_maskz_getmant_ss (__mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantss xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Normalize the mantissas of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
The mantissa is normalized to the interval specified by interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2) _MM_MANT_NORM_p5_2 // interval [0.5, 2) _MM_MANT_NORM_p5_1 // interval [0.5, 1) _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src) _MM_MANT_SIGN_zero // sign = 0 _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1

Operation

IF k[0] dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv) ELSE dst[31:0] := 0 FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vgmaxpd
__m512d _mm512_gmax_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_gmax_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vgmaxpd zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Determines the maximum of each pair of corresponding elements in packed double-precision (64-bit) floating-point elements in a and b, storing the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := FpMax(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
vgmaxpd
__m512d _mm512_mask_gmax_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_gmax_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vgmaxpd zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Determines the maximum of each pair of corresponding elements of packed double-precision (64-bit) floating-point elements in a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := FpMax(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vgmaxps
__m512 _mm512_gmax_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_gmax_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgmaxps zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Determines the maximum of each pair of corresponding elements in packed single-precision (32-bit) floating-point elements in a and b, storing the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := FpMax(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
vgmaxps
__m512 _mm512_mask_gmax_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_gmax_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgmaxps zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Determines the maximum of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := FpMax(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vgmaxabsps
__m512 _mm512_gmaxabs_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_gmaxabs_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgmaxabsps zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in a and b, storing the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i])) ENDFOR dst[MAX:512] := 0
vgmaxabsps
__m512 _mm512_mask_gmaxabs_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_gmaxabs_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgmaxabsps zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i])) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vgminpd
__m512d _mm512_gmin_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_gmin_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vgminpd zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Determines the minimum of each pair of corresponding elements in packed double-precision (64-bit) floating-point elements in a and b, storing the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := FpMin(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
vgminpd
__m512d _mm512_mask_gmin_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_gmin_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vgminpd zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Determines the minimum of each pair of corresponding elements of packed double-precision (64-bit) floating-point elements in a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := FpMin(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vgminps
__m512 _mm512_gmin_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_gmin_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgminps zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Determines the minimum of each pair of corresponding elements in packed single-precision (32-bit) floating-point elements in a and b, storing the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := FpMin(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
vgminps
__m512 _mm512_mask_gmin_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_gmin_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgminps zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Determines the minimum of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := FpMin(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
phaddw
__m128i _mm_hadd_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_hadd_epi16 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: phaddw xmm, xmm
CPUID Flags: SSSE3

Description

Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the signed 16-bit results in dst.

Operation

dst[15:0] := a[31:16] + a[15:0] dst[31:16] := a[63:48] + a[47:32] dst[47:32] := a[95:80] + a[79:64] dst[63:48] := a[127:112] + a[111:96] dst[79:64] := b[31:16] + b[15:0] dst[95:80] := b[63:48] + b[47:32] dst[111:96] := b[95:80] + b[79:64] dst[127:112] := b[127:112] + b[111:96]

Performance

ArchitectureLatencyThroughput
Haswell31.5
Ivy Bridge31.5
Sandy Bridge31.5
Westmere31.5
Nehalem31.5
vphaddw
__m256i _mm256_hadd_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_hadd_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vphaddw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the signed 16-bit results in dst.

Operation

dst[15:0] := a[31:16] + a[15:0] dst[31:16] := a[63:48] + a[47:32] dst[47:32] := a[95:80] + a[79:64] dst[63:48] := a[127:112] + a[111:96] dst[79:64] := b[31:16] + b[15:0] dst[95:80] := b[63:48] + b[47:32] dst[111:96] := b[95:80] + b[79:64] dst[127:112] := b[127:112] + b[111:96] dst[143:128] := a[159:144] + a[143:128] dst[159:144] := a[191:176] + a[175:160] dst[175:160] := a[223:208] + a[207:192] dst[191:176] := a[255:240] + a[239:224] dst[207:192] := b[159:144] + b[143:128] dst[223:208] := b[191:176] + b[175:160] dst[239:224] := b[223:208] + b[207:192] dst[255:240] := b[255:240] + b[239:224] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell32
phaddd
__m128i _mm_hadd_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_hadd_epi32 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: phaddd xmm, xmm
CPUID Flags: SSSE3

Description

Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the signed 32-bit results in dst.

Operation

dst[31:0] := a[63:32] + a[31:0] dst[63:32] := a[127:96] + a[95:64] dst[95:64] := b[63:32] + b[31:0] dst[127:96] := b[127:96] + b[95:64]

Performance

ArchitectureLatencyThroughput
Haswell31.5
Ivy Bridge31.5
Sandy Bridge31.5
Westmere31.5
Nehalem31.5
vphaddd
__m256i _mm256_hadd_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_hadd_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vphaddd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the signed 32-bit results in dst.

Operation

dst[31:0] := a[63:32] + a[31:0] dst[63:32] := a[127:96] + a[95:64] dst[95:64] := b[63:32] + b[31:0] dst[127:96] := b[127:96] + b[95:64] dst[159:128] := a[191:160] + a[159:128] dst[191:160] := a[255:224] + a[223:192] dst[223:192] := b[191:160] + b[159:128] dst[255:224] := b[255:224] + b[223:192] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell32
haddpd
__m128d _mm_hadd_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_hadd_pd (__m128d a, __m128d b)
#include "pmmintrin.h"
Instruction: haddpd xmm, xmm
CPUID Flags: SSE3

Description

Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the results in dst.

Operation

dst[63:0] := a[127:64] + a[63:0] dst[127:64] := b[127:64] + b[63:0]

Performance

ArchitectureLatencyThroughput
Haswell52
Ivy Bridge52
Sandy Bridge52
Westmere52
Nehalem52
vhaddpd
__m256d _mm256_hadd_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_hadd_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vhaddpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the results in dst.

Operation

dst[63:0] := a[127:64] + a[63:0] dst[127:64] := b[127:64] + b[63:0] dst[191:128] := a[255:192] + a[191:128] dst[255:192] := b[255:192] + b[191:128] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell5-
Ivy Bridge5-
Sandy Bridge5-
phaddw
__m64 _mm_hadd_pi16 (__m64 a, __m64 b)

Synopsis

__m64 _mm_hadd_pi16 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: phaddw mm, mm
CPUID Flags: SSSE3

Description

Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the signed 16-bit results in dst.

Operation

dst[15:0] := a[31:16] + a[15:0] dst[31:16] := a[63:48] + a[47:32] dst[47:32] := b[31:16] + b[15:0] dst[63:48] := b[63:48] + b[47:32]
phaddd
__m64 _mm_hadd_pi32 (__m64 a, __m64 b)

Synopsis

__m64 _mm_hadd_pi32 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: phaddd mm, mm
CPUID Flags: SSSE3

Description

Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the signed 32-bit results in dst.

Operation

dst[31:0] := a[63:32] + a[31:0] dst[63:32] := b[63:32] + b[31:0]
haddps
__m128 _mm_hadd_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_hadd_ps (__m128 a, __m128 b)
#include "pmmintrin.h"
Instruction: haddps xmm, xmm
CPUID Flags: SSE3

Description

Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the results in dst.

Operation

dst[31:0] := a[63:32] + a[31:0] dst[63:32] := a[127:96] + a[95:64] dst[95:64] := b[63:32] + b[31:0] dst[127:96] := b[127:96] + b[95:64]

Performance

ArchitectureLatencyThroughput
Haswell52
Ivy Bridge52
Sandy Bridge52
Westmere52
Nehalem52
vhaddps
__m256 _mm256_hadd_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_hadd_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vhaddps ymm, ymm, ymm
CPUID Flags: AVX

Description

Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the results in dst.

Operation

dst[31:0] := a[63:32] + a[31:0] dst[63:32] := a[127:96] + a[95:64] dst[95:64] := b[63:32] + b[31:0] dst[127:96] := b[127:96] + b[95:64] dst[159:128] := a[191:160] + a[159:128] dst[191:160] := a[255:224] + a[223:192] dst[223:192] := b[191:160] + b[159:128] dst[255:224] := b[255:224] + b[223:192] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell5-
Ivy Bridge5-
Sandy Bridge5-
phaddsw
__m128i _mm_hadds_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_hadds_epi16 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: phaddsw xmm, xmm
CPUID Flags: SSSE3

Description

Horizontally add adjacent pairs of 16-bit integers in a and b using saturation, and pack the signed 16-bit results in dst.

Operation

dst[15:0]= Saturate_To_Int16(a[31:16] + a[15:0]) dst[31:16] = Saturate_To_Int16(a[63:48] + a[47:32]) dst[47:32] = Saturate_To_Int16(a[95:80] + a[79:64]) dst[63:48] = Saturate_To_Int16(a[127:112] + a[111:96]) dst[79:64] = Saturate_To_Int16(b[31:16] + b[15:0]) dst[95:80] = Saturate_To_Int16(b[63:48] + b[47:32]) dst[111:96] = Saturate_To_Int16(b[95:80] + b[79:64]) dst[127:112] = Saturate_To_Int16(b[127:112] + b[111:96])

Performance

ArchitectureLatencyThroughput
Haswell31.5
Ivy Bridge31.5
Sandy Bridge31.5
Westmere31.5
Nehalem31.5
vphaddsw
__m256i _mm256_hadds_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_hadds_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vphaddsw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Horizontally add adjacent pairs of 16-bit integers in a and b using saturation, and pack the signed 16-bit results in dst.

Operation

dst[15:0] := Saturate_To_Int16(a[31:16] + a[15:0]) dst[31:16] := Saturate_To_Int16(a[63:48] + a[47:32]) dst[47:32] := Saturate_To_Int16(a[95:80] + a[79:64]) dst[63:48] := Saturate_To_Int16(a[127:112] + a[111:96]) dst[79:64] := Saturate_To_Int16(b[31:16] + b[15:0]) dst[95:80] := Saturate_To_Int16(b[63:48] + b[47:32]) dst[111:96] := Saturate_To_Int16(b[95:80] + b[79:64]) dst[127:112] := Saturate_To_Int16(b[127:112] + b[111:96]) dst[143:128] := Saturate_To_Int16(a[159:144] + a[143:128]) dst[159:144] := Saturate_To_Int16(a[191:176] + a[175:160]) dst[175:160] := Saturate_To_Int16(a[223:208] + a[207:192]) dst[191:176] := Saturate_To_Int16(a[255:240] + a[239:224]) dst[207:192] := Saturate_To_Int16(b[159:144] + b[143:128]) dst[223:208] := Saturate_To_Int16(b[191:176] + b[175:160]) dst[239:224] := Saturate_To_Int16(b[223:208] + b[207:192]) dst[255:240] := Saturate_To_Int16(b[255:240] + b[239:224]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell32
phaddsw
__m64 _mm_hadds_pi16 (__m64 a, __m64 b)

Synopsis

__m64 _mm_hadds_pi16 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: phaddsw mm, mm
CPUID Flags: SSSE3

Description

Horizontally add adjacent pairs of 16-bit integers in a and b using saturation, and pack the signed 16-bit results in dst.

Operation

dst[15:0]= Saturate_To_Int16(a[31:16] + a[15:0]) dst[31:16] = Saturate_To_Int16(a[63:48] + a[47:32]) dst[47:32] = Saturate_To_Int16(b[31:16] + b[15:0]) dst[63:48] = Saturate_To_Int16(b[63:48] + b[47:32])
phsubw
__m128i _mm_hsub_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_hsub_epi16 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: phsubw xmm, xmm
CPUID Flags: SSSE3

Description

Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack the signed 16-bit results in dst.

Operation

dst[15:0] := a[15:0] - a[31:16] dst[31:16] := a[47:32] - a[63:48] dst[47:32] := a[79:64] - a[95:80] dst[63:48] := a[111:96] - a[127:112] dst[79:64] := b[15:0] - b[31:16] dst[95:80] := b[47:32] - b[63:48] dst[111:96] := b[79:64] - b[95:80] dst[127:112] := b[111:96] - b[127:112]

Performance

ArchitectureLatencyThroughput
Haswell31.5
Ivy Bridge31.5
Sandy Bridge31.5
Westmere31.5
Nehalem31.5
vphsubw
__m256i _mm256_hsub_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_hsub_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vphsubw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack the signed 16-bit results in dst.

Operation

dst[15:0] := a[15:0] - a[31:16] dst[31:16] := a[47:32] - a[63:48] dst[47:32] := a[79:64] - a[95:80] dst[63:48] := a[111:96] - a[127:112] dst[79:64] := b[15:0] - b[31:16] dst[95:80] := b[47:32] - b[63:48] dst[111:96] := b[79:64] - b[95:80] dst[127:112] := b[111:96] - b[127:112] dst[143:128] := a[143:128] - a[159:144] dst[159:144] := a[175:160] - a[191:176] dst[175:160] := a[207:192] - a[223:208] dst[191:176] := a[239:224] - a[255:240] dst[207:192] := b[143:128] - b[159:144] dst[223:208] := b[175:160] - b[191:176] dst[239:224] := b[207:192] - b[223:208] dst[255:240] := b[239:224] - b[255:240] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
phsubd
__m128i _mm_hsub_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_hsub_epi32 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: phsubd xmm, xmm
CPUID Flags: SSSE3

Description

Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack the signed 32-bit results in dst.

Operation

dst[31:0] := a[31:0] - a[63:32] dst[63:32] := a[95:64] - a[127:96] dst[95:64] := b[31:0] - b[63:32] dst[127:96] := b[95:64] - b[127:96]

Performance

ArchitectureLatencyThroughput
Haswell31.5
Ivy Bridge31.5
Sandy Bridge31.5
Westmere31.5
Nehalem31.5
vphsubd
__m256i _mm256_hsub_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_hsub_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vphsubd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack the signed 32-bit results in dst.

Operation

dst[31:0] := a[31:0] - a[63:32] dst[63:32] := a[95:64] - a[127:96] dst[95:64] := b[31:0] - b[63:32] dst[127:96] := b[95:64] - b[127:96] dst[159:128] := a[159:128] - a[191:160] dst[191:160] := a[223:192] - a[255:224] dst[223:192] := b[159:128] - b[191:160] dst[255:224] := b[223:192] - b[255:224] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
hsubpd
__m128d _mm_hsub_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_hsub_pd (__m128d a, __m128d b)
#include "pmmintrin.h"
Instruction: hsubpd xmm, xmm
CPUID Flags: SSE3

Description

Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the results in dst.

Operation

dst[63:0] := a[63:0] - a[127:64] dst[127:64] := b[63:0] - b[127:64]

Performance

ArchitectureLatencyThroughput
Haswell52
Ivy Bridge52
Sandy Bridge52
Westmere52
Nehalem52
vhsubpd
__m256d _mm256_hsub_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_hsub_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vhsubpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the results in dst.

Operation

dst[63:0] := a[63:0] - a[127:64] dst[127:64] := b[63:0] - b[127:64] dst[191:128] := a[191:128] - a[255:192] dst[255:192] := b[191:128] - b[255:192] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell5-
Ivy Bridge5-
Sandy Bridge5-
phsubw
__m64 _mm_hsub_pi16 (__m64 a, __m64 b)

Synopsis

__m64 _mm_hsub_pi16 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: phsubw mm, mm
CPUID Flags: SSSE3

Description

Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack the signed 16-bit results in dst.

Operation

dst[15:0] := a[15:0] - a[31:16] dst[31:16] := a[47:32] - a[63:48] dst[47:32] := b[15:0] - b[31:16] dst[63:48] := b[47:32] - b[63:48]
phsubd
__m64 _mm_hsub_pi32 (__m64 a, __m64 b)

Synopsis

__m64 _mm_hsub_pi32 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: phsubd mm, mm
CPUID Flags: SSSE3

Description

Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack the signed 32-bit results in dst.

Operation

dst[31:0] := a[31:0] - a[63:32] dst[63:32] := b[31:0] - b[63:32]
hsubps
__m128 _mm_hsub_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_hsub_ps (__m128 a, __m128 b)
#include "pmmintrin.h"
Instruction: hsubps xmm, xmm
CPUID Flags: SSE3

Description

Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the results in dst.

Operation

dst[31:0] := a[31:0] - a[63:32] dst[63:32] := a[95:64] - a[127:96] dst[95:64] := b[31:0] - b[63:32] dst[127:96] := b[95:64] - b[127:96]

Performance

ArchitectureLatencyThroughput
Haswell52
Ivy Bridge52
Sandy Bridge52
Westmere52
Nehalem52
vhsubps
__m256 _mm256_hsub_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_hsub_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vhsubps ymm, ymm, ymm
CPUID Flags: AVX

Description

Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the results in dst.

Operation

dst[31:0] := a[31:0] - a[63:32] dst[63:32] := a[95:64] - a[127:96] dst[95:64] := b[31:0] - b[63:32] dst[127:96] := b[95:64] - b[127:96] dst[159:128] := a[159:128] - a[191:160] dst[191:160] := a[223:192] - a[255:224] dst[223:192] := b[159:128] - b[191:160] dst[255:224] := b[223:192] - b[255:224] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell5-
Ivy Bridge5-
Sandy Bridge5-
phsubsw
__m128i _mm_hsubs_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_hsubs_epi16 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: phsubsw xmm, xmm
CPUID Flags: SSSE3

Description

Horizontally subtract adjacent pairs of 16-bit integers in a and b using saturation, and pack the signed 16-bit results in dst.

Operation

dst[15:0]= Saturate_To_Int16(a[15:0] - a[31:16]) dst[31:16] = Saturate_To_Int16(a[47:32] - a[63:48]) dst[47:32] = Saturate_To_Int16(a[79:64] - a[95:80]) dst[63:48] = Saturate_To_Int16(a[111:96] - a[127:112]) dst[79:64] = Saturate_To_Int16(b[15:0] - b[31:16]) dst[95:80] = Saturate_To_Int16(b[47:32] - b[63:48]) dst[111:96] = Saturate_To_Int16(b[79:64] - b[95:80]) dst[127:112] = Saturate_To_Int16(b[111:96] - b[127:112])

Performance

ArchitectureLatencyThroughput
Haswell31.5
Ivy Bridge31.5
Sandy Bridge31.5
Westmere31.5
Nehalem31.5
vphsubsw
__m256i _mm256_hsubs_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_hsubs_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vphsubsw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Horizontally subtract adjacent pairs of 16-bit integers in a and b using saturation, and pack the signed 16-bit results in dst.

Operation

dst[15:0]= Saturate_To_Int16(a[15:0] - a[31:16]) dst[31:16] = Saturate_To_Int16(a[47:32] - a[63:48]) dst[47:32] = Saturate_To_Int16(a[79:64] - a[95:80]) dst[63:48] = Saturate_To_Int16(a[111:96] - a[127:112]) dst[79:64] = Saturate_To_Int16(b[15:0] - b[31:16]) dst[95:80] = Saturate_To_Int16(b[47:32] - b[63:48]) dst[111:96] = Saturate_To_Int16(b[79:64] - b[95:80]) dst[127:112] = Saturate_To_Int16(b[111:96] - b[127:112]) dst[143:128]= Saturate_To_Int16(a[143:128] - a[159:144]) dst[159:144] = Saturate_To_Int16(a[175:160] - a[191:176]) dst[175:160] = Saturate_To_Int16(a[207:192] - a[223:208]) dst[191:176] = Saturate_To_Int16(a[239:224] - a[255:240]) dst[207:192] = Saturate_To_Int16(b[143:128] - b[159:144]) dst[223:208] = Saturate_To_Int16(b[175:160] - b[191:176]) dst[239:224] = Saturate_To_Int16(b[207:192] - b[223:208]) dst[255:240] = Saturate_To_Int16(b[239:224] - b[255:240]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
phsubsw
__m64 _mm_hsubs_pi16 (__m64 a, __m64 b)

Synopsis

__m64 _mm_hsubs_pi16 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: phsubsw mm, mm
CPUID Flags: SSSE3

Description

Horizontally subtract adjacent pairs of 16-bit integers in a and b using saturation, and pack the signed 16-bit results in dst.

Operation

dst[15:0]= Saturate_To_Int16(a[15:0] - a[31:16]) dst[31:16] = Saturate_To_Int16(a[47:32] - a[63:48]) dst[47:32] = Saturate_To_Int16(b[15:0] - b[31:16]) dst[63:48] = Saturate_To_Int16(b[47:32] - b[63:48])
...
__m128d _mm_hypot_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_hypot_pd (__m128d a, __m128d b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_hypot_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_hypot_pd (__m256d a, __m256d b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_hypot_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_hypot_pd (__m512d a, __m512d b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_hypot_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_hypot_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_hypot_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_hypot_ps (__m128 a, __m128 b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_hypot_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_hypot_ps (__m256 a, __m256 b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_hypot_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_hypot_ps (__m512 a, __m512 b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_hypot_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_hypot_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpgatherdd
__m512i _mm512_i32extgather_epi32 (__m512i index, void const * mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)

Synopsis

__m512i _mm512_i32extgather_epi32 (__m512i index, void const * mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpgatherdd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Up-converts 16 memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv to 32-bit integer elements and stores them in dst.

Operation

FOR j := 0 to 15 addr := MEM[mv + index[j] * scale] i := j*32 CASE conv OF _MM_UPCONV_EPI32_NONE: dst[i+31:i] := addr[i+31:i] _MM_UPCONV_EPI32_UINT8: n := j*8 dst[i+31:i] := UInt8ToUInt32(addr[n+7:n]) _MM_UPCONV_EPI32_SINT8: n := j*8 dst[i+31:i] := Int8ToInt32(addr[n+7:n]) _MM_UPCONV_EPI32_UINT16: n := j*16 dst[i+31:i] := UInt16ToUInt32(addr[n+15:n]) _MM_UPCONV_EPI32_SINT16: n := j*16 dst[i+31:i] := Int16ToInt32(addr[n+15:n]) ESAC ENDFOR dst[MAX:512] := 0
vpgatherdd
__m512i _mm512_mask_i32extgather_epi32 (__m512i src, __mmask16 k, __m512i index, void const * mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)

Synopsis

__m512i _mm512_mask_i32extgather_epi32 (__m512i src, __mmask16 k, __m512i index, void const * mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpgatherdd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Up-converts 16 memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv to 32-bit integer elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 addr := MEM[mv + index[j] * scale] i := j*32 IF k[j] CASE conv OF _MM_UPCONV_EPI32_NONE: dst[i+31:i] := addr[i+31:i] _MM_UPCONV_EPI32_UINT8: n := j*8 dst[i+31:i] := UInt8ToUInt32(addr[n+7:n]) _MM_UPCONV_EPI32_SINT8: n := j*8 dst[i+31:i] := Int8ToInt32(addr[n+7:n]) _MM_UPCONV_EPI32_UINT16: n := j*16 dst[i+31:i] := UInt16ToUInt32(addr[n+15:n]) _MM_UPCONV_EPI32_SINT16: n := j*16 dst[i+31:i] := Int16ToInt32(addr[n+15:n]) ESAC ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vgatherdps
__m512 _mm512_i32extgather_ps (__m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)

Synopsis

__m512 _mm512_i32extgather_ps (__m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherdps zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Up-converts 16 memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv to single-precision (32-bit) floating-point elements and stores them in dst.

Operation

FOR j := 0 to 15 addr := MEM[mv + index[j] * scale] i := j*32 CASE conv OF _MM_UPCONV_PS_NONE: dst[i+31:i] := addr[i+31:i] _MM_UPCONV_PS_FLOAT16: n := j*16 dst[i+31:i] := Float16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_UINT8: n := j*8 dst[i+31:i] := UInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_SINT8: n := j*8 dst[i+31:i] := SInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_UINT16: n := j*16 dst[i+31:i] := UInt16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_SINT16: n := j*16 dst[i+31:i] := SInt16ToFloat32(addr[n+15:n]) ESAC ENDFOR dst[MAX:512] := 0
vgatherdps
__m512 _mm512_mask_i32extgather_ps (__m512 src, __mmask16 k, __m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)

Synopsis

__m512 _mm512_mask_i32extgather_ps (__m512 src, __mmask16 k, __m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherdps zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Up-converts 16 single-precision (32-bit) memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv to single-precision (32-bit) floating-point elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 addr := MEM[mv + index[j] * scale] i := j*32 IF k[j] CASE conv OF _MM_UPCONV_PS_NONE: dst[i+31:i] := addr[i+31:i] _MM_UPCONV_PS_FLOAT16: n := j*16 dst[i+31:i] := Float16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_UINT8: n := j*8 dst[i+31:i] := UInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_SINT8: n := j*8 dst[i+31:i] := SInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_UINT16: n := j*16 dst[i+31:i] := UInt16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_SINT16: n := j*16 dst[i+31:i] := SInt16ToFloat32(addr[n+15:n]) ESAC ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpscatterdd
void _mm512_i32extscatter_epi32 (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)

Synopsis

void _mm512_i32extscatter_epi32 (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpscatterdd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Down-converts 16 packed 32-bit integer elements in v1 using conv and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale. hint indicates to the processor whether the data is non-temporal.

Operation

FOR j := 0 to 15 addr := MEM[mv + index[j] * scale] i := j*32 CASE conv OF _MM_DOWNCONV_EPI32_NONE: addr[i+31:i] := v1[i+31:i] _MM_DOWNCONV_EPI32_UINT8: n := j*8 addr[n+7:n] := UInt32ToUInt8(v1[i+31:i]) _MM_DOWNCONV_EPI32_SINT8: n := j*8 addr[n+7:n] := SInt32ToSInt8(v1[i+31:i]) _MM_DOWNCONV_EPI32_UINT16: n := j*16 addr[n+15:n] := UInt32ToUInt16(v1[i+31:i]) _MM_DOWNCONV_EPI32_SINT16: n := j*16 addr[n+15:n] := SInt32ToSInt16(v1[i+31:i]) ESAC ENDFOR
vpscatterdd
void _mm512_mask_i32extscatter_epi32 (void * mv, __mmask16 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)

Synopsis

void _mm512_mask_i32extscatter_epi32 (void * mv, __mmask16 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpscatterdd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Down-converts 16 packed 32-bit integer elements in v1 using conv and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale. Elements are written using writemask k (elements are only written when the corresponding mask bit is set; otherwise, elements are left unchanged in memory). hint indicates to the processor whether the data is non-temporal.

Operation

FOR j := 0 to 15 addr := MEM[mv + index[j] * scale] i := j*32 IF k[j] CASE conv OF _MM_DOWNCONV_EPI32_NONE: addr[i+31:i] := v1[i+31:i] _MM_DOWNCONV_EPI32_UINT8: n := j*8 addr[n+7:n] := UInt32ToUInt8(v1[i+31:i]) _MM_DOWNCONV_EPI32_SINT8: n := j*8 addr[n+7:n] := SInt32ToSInt8(v1[i+31:i]) _MM_DOWNCONV_EPI32_UINT16: n := j*16 addr[n+15:n] := UInt32ToUInt16(v1[i+31:i]) _MM_DOWNCONV_EPI32_SINT16: n := j*16 addr[n+15:n] := SInt32ToSInt16(v1[i+31:i]) ESAC FI ENDFOR
vscatterdps
void _mm512_i32extscatter_ps (void * mv, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)

Synopsis

void _mm512_i32extscatter_ps (void * mv, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterdps m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Down-converts 16 packed single-precision (32-bit) floating-point elements in v1 and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv.

Operation

FOR j := 0 to 15 addr := MEM[mv + index[j] * scale] i := j*32 CASE conv OF _MM_DOWNCONV_PS_NONE: addr[i+31:i] := v1[i+31:i] _MM_DOWNCONV_PS_FLOAT16: n := j*16 addr[n+15:n] := Float32ToFloat16(v1[i+31:i]) _MM_DOWNCONV_PS_UINT8: n := j*8 addr[n+7:n] := Float32ToUInt8(v1[i+31:i]) _MM_DOWNCONV_PS_SINT8: n := j*8 addr[n+7:n] := Float32ToSInt8(v1[i+31:i]) _MM_DOWNCONV_PS_UINT16: n := j*16 addr[n+15:n] := Float32ToUInt16(v1[i+31:i]) _MM_DOWNCONV_PS_SINT16: n := j*16 addr[n+15:n] := Float32ToSInt16(v1[i+31:i]) ESAC ENDFOR
vscatterdps
void _mm512_mask_i32extscatter_ps (void * mv, __mmask16 k, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)

Synopsis

void _mm512_mask_i32extscatter_ps (void * mv, __mmask16 k, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterdps m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Down-converts 16 packed single-precision (32-bit) floating-point elements in v1 according to conv and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using writemask k (elements are written only when the corresponding mask bit is set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] addr := MEM[mv + index[j] * scale] CASE conv OF _MM_DOWNCONV_PS_NONE: addr[i+31:i] := v1[i+31:i] _MM_DOWNCONV_PS_FLOAT16: n := j*16 addr[n+15:n] := Float32ToFloat16(v1[i+31:i]) _MM_DOWNCONV_PS_UINT8: n := j*8 addr[n+7:n] := Float32ToUInt8(v1[i+31:i]) _MM_DOWNCONV_PS_SINT8: n := j*8 addr[n+7:n] := Float32ToSInt8(v1[i+31:i]) _MM_DOWNCONV_PS_UINT16: n := j*16 addr[n+15:n] := Float32ToUInt16(v1[i+31:i]) _MM_DOWNCONV_PS_SINT16: n := j*16 addr[n+15:n] := Float32ToSInt16(v1[i+31:i]) ESAC FI ENDFOR
vpgatherdd
__m128i _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, const int scale)

Synopsis

__m128i _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherdd xmm, vm32x, xmm
CPUID Flags: AVX2

Description

Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vpgatherdd
__m128i _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale)

Synopsis

__m128i _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherdd xmm, vm32x, xmm
CPUID Flags: AVX2

Description

Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 IF mask[i+31] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] mask[i+31] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR mask[MAX:128] := 0 dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vpgatherdd
__m128i _mm_mmask_i32gather_epi32 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)

Synopsis

__m128i _mm_mmask_i32gather_epi32 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherdd
CPUID Flags: AVX512VL + AVX512F

Description

Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] k[j] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR k[MAX:4] := 0 dst[MAX:128] := 0
vpgatherdd
__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale)

Synopsis

__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherdd ymm, vm32x, ymm
CPUID Flags: AVX2

Description

Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vpgatherdd
__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)

Synopsis

__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherdd ymm, vm32x, ymm
CPUID Flags: AVX2

Description

Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 IF mask[i+31] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] mask[i+31] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR mask[MAX:256] := 0 dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vpgatherdd
__m256i _mm256_mmask_i32gather_epi32 (__m256i src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)

Synopsis

__m256i _mm256_mmask_i32gather_epi32 (__m256i src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherdd
CPUID Flags: AVX512VL + AVX512F

Description

Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] k[j] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR k[MAX:8] := 0 dst[MAX:256] := 0
vpgatherdd
__m512i _mm512_i32gather_epi32 (__m512i vindex, void const* base_addr, int scale)

Synopsis

__m512i _mm512_i32gather_epi32 (__m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherdd zmm {k}, vm32z
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] ENDFOR dst[MAX:512] := 0
vpgatherdd
__m512i _mm512_mask_i32gather_epi32 (__m512i src, __mmask16 k, __m512i vindex, void const* base_addr, int scale)

Synopsis

__m512i _mm512_mask_i32gather_epi32 (__m512i src, __mmask16 k, __m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherdd zmm {k}, vm32z
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] k[j] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR k[MAX:16] := 0 dst[MAX:512] := 0
vpgatherdq
__m128i _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)

Synopsis

__m128i _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherdq xmm, vm64x, xmm
CPUID Flags: AVX2

Description

Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 m := j*32 dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vpgatherdq
__m128i _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale)

Synopsis

__m128i _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherdq xmm, vm64x, xmm
CPUID Flags: AVX2

Description

Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 m := j*32 IF mask[i+63] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] mask[i+63] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR mask[MAX:128] := 0 dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vpgatherdq
__m128i _mm_mmask_i32gather_epi64 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)

Synopsis

__m128i _mm_mmask_i32gather_epi64 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherdq
CPUID Flags: AVX512VL + AVX512F

Description

Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 m := j*32 IF k[j] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] k[j] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR k[MAX:2] := 0 dst[MAX:128] := 0
vpgatherdq
__m256i _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)

Synopsis

__m256i _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherdq ymm, vm64x, ymm
CPUID Flags: AVX2

Description

Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 m := j*32 dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vpgatherdq
__m256i _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale)

Synopsis

__m256i _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherdq ymm, vm64x, ymm
CPUID Flags: AVX2

Description

Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 m := j*32 IF mask[i+63] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] mask[i+63] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR mask[MAX:256] := 0 dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vpgatherdq
__m256i _mm256_mmask_i32gather_epi64 (__m256i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)

Synopsis

__m256i _mm256_mmask_i32gather_epi64 (__m256i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherdq
CPUID Flags: AVX512VL + AVX512F

Description

Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 m := j*32 IF k[j] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] k[j] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR k[MAX:4] := 0 dst[MAX:256] := 0
vpgatherdq
__m512i _mm512_i32gather_epi64 (__m256i vindex, void const* base_addr, int scale)

Synopsis

__m512i _mm512_i32gather_epi64 (__m256i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherdq zmm {k}, vm32y
CPUID Flags: AVX512F

Description

Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 m := j*32 dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] ENDFOR dst[MAX:512] := 0
vpgatherdq
__m512i _mm512_mask_i32gather_epi64 (__m512i src, __mmask8 k, __m256i vindex, void const* base_addr, int scale)

Synopsis

__m512i _mm512_mask_i32gather_epi64 (__m512i src, __mmask8 k, __m256i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherdq zmm {k}, vm32y
CPUID Flags: AVX512F

Description

Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 m := j*32 IF k[j] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] k[j] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR k[MAX:8] := 0 dst[MAX:512] := 0
vgatherdpd
__m128d _mm_i32gather_pd (double const* base_addr, __m128i vindex, const int scale)

Synopsis

__m128d _mm_i32gather_pd (double const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherdpd xmm, vm64x, xmm
CPUID Flags: AVX2

Description

Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 m := j*32 dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherdpd
__m128d _mm_mask_i32gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale)

Synopsis

__m128d _mm_mask_i32gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale)
#include "immintrin.h"
Instruction: vgatherdpd xmm, vm64x, xmm
CPUID Flags: AVX2

Description

Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 m := j*32 IF mask[i+63] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] mask[i+63] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR mask[MAX:128] := 0 dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherdpd
__m128d _mm_mmask_i32gather_pd (__m128d src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)

Synopsis

__m128d _mm_mmask_i32gather_pd (__m128d src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherdpd
CPUID Flags: AVX512VL + AVX512F

Description

Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 m := j*32 IF k[j] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] k[j] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR k[MAX:2] := 0 dst[MAX:128] := 0
vgatherdpd
__m256d _mm256_i32gather_pd (double const* base_addr, __m128i vindex, const int scale)

Synopsis

__m256d _mm256_i32gather_pd (double const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherdpd ymm, vm64x, ymm
CPUID Flags: AVX2

Description

Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 m := j*32 dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherdpd
__m256d _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale)

Synopsis

__m256d _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale)
#include "immintrin.h"
Instruction: vgatherdpd ymm, vm64x, ymm
CPUID Flags: AVX2

Description

Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 m := j*32 IF mask[i+63] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] mask[i+63] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR mask[MAX:256] := 0 dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherdpd
__m256d _mm256_mmask_i32gather_pd (__m256d src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)

Synopsis

__m256d _mm256_mmask_i32gather_pd (__m256d src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherdpd
CPUID Flags: AVX512VL + AVX512F

Description

Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 m := j*32 IF k[j] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] k[j] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR k[MAX:4] := 0 dst[MAX:256] := 0
vgatherdpd
__m512d _mm512_i32gather_pd (__m256i vindex, void const* base_addr, int scale)

Synopsis

__m512d _mm512_i32gather_pd (__m256i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherdpd zmm {k}, vm32y
CPUID Flags: AVX512F

Description

Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 m := j*32 dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] ENDFOR dst[MAX:512] := 0
vgatherdpd
__m512d _mm512_mask_i32gather_pd (__m512d src, __mmask8 k, __m256i vindex, void const* base_addr, int scale)

Synopsis

__m512d _mm512_mask_i32gather_pd (__m512d src, __mmask8 k, __m256i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherdpd zmm {k}, vm32y
CPUID Flags: AVX512F

Description

Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 m := j*32 IF k[j] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale] k[j] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR k[MAX:8] := 0 dst[MAX:512] := 0
vgatherdps
__m128 _mm_i32gather_ps (float const* base_addr, __m128i vindex, const int scale)

Synopsis

__m128 _mm_i32gather_ps (float const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherdps xmm, vm32x, xmm
CPUID Flags: AVX2

Description

Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherdps
__m128 _mm_mask_i32gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale)

Synopsis

__m128 _mm_mask_i32gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale)
#include "immintrin.h"
Instruction: vgatherdps xmm, vm32x, xmm
CPUID Flags: AVX2

Description

Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 IF mask[i+31] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] mask[i+31] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR mask[MAX:128] := 0 dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherdps
__m128 _mm_mmask_i32gather_ps (__m128 src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)

Synopsis

__m128 _mm_mmask_i32gather_ps (__m128 src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherdps
CPUID Flags: AVX512VL + AVX512F

Description

Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] k[j] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR k[MAX:4] := 0 dst[MAX:128] := 0
vgatherdps
__m256 _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale)

Synopsis

__m256 _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherdps ymm, vm32x, ymm
CPUID Flags: AVX2

Description

Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherdps
__m256 _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale)

Synopsis

__m256 _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale)
#include "immintrin.h"
Instruction: vgatherdps ymm, vm32x, ymm
CPUID Flags: AVX2

Description

Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 IF mask[i+31] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] mask[i+31] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR mask[MAX:256] := 0 dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherdps
__m256 _mm256_mmask_i32gather_ps (__m256 src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)

Synopsis

__m256 _mm256_mmask_i32gather_ps (__m256 src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherdps
CPUID Flags: AVX512VL + AVX512F

Description

Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] k[j] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR k[MAX:8] := 0 dst[MAX:256] := 0
vgatherdps
__m512 _mm512_i32gather_ps (__m512i vindex, void const* base_addr, int scale)

Synopsis

__m512 _mm512_i32gather_ps (__m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherdps zmm {k}, vm32z
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] ENDFOR dst[MAX:512] := 0
vgatherdps
__m512 _mm512_mask_i32gather_ps (__m512 src, __mmask16 k, __m512i vindex, void const* base_addr, int scale)

Synopsis

__m512 _mm512_mask_i32gather_ps (__m512 src, __mmask16 k, __m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherdps zmm {k}, vm32z
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale] k[j] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR k[MAX:16] := 0 dst[MAX:512] := 0
vpgatherdq
__m512i _mm512_i32loextgather_epi64 (__m512i index, void const * mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)

Synopsis

__m512i _mm512_i32loextgather_epi64 (__m512i index, void const * mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpgatherdq zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Up-converts 8 double-precision (64-bit) memory locations starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale using conv to 64-bit integer elements and stores them in dst.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*64 CASE conv OF _MM_UPCONV_EPI64_NONE: dst[i+63:i] := addr[i+63:i] ESAC ENDFOR dst[MAX:512] := 0
vpgatherdq
__m512i _mm512_mask_i32loextgather_epi64 (__m512i src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)

Synopsis

__m512i _mm512_mask_i32loextgather_epi64 (__m512i src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpgatherdq zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Up-converts 8 double-precision (64-bit) memory locations starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale using conv to 64-bit integer elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*64 IF k[j] CASE conv OF _MM_UPCONV_EPI64_NONE: dst[i+63:i] := addr[i+63:i] ESAC ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vgatherdpd
__m512d _mm512_i32loextgather_pd (__m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)

Synopsis

__m512d _mm512_i32loextgather_pd (__m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherdpd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Up-converts 8 double-precision (64-bit) floating-point elements in memory locations starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale using conv to 64-bit floating-point elements and stores them in dst.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*64 CASE conv OF _MM_UPCONV_PD_NONE: dst[i+63:i] := addr[i+63:i] ESAC ENDFOR dst[MAX:512] := 0
vgatherdpd
__m512d _mm512_mask_i32loextgather_pd (__m512d src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)

Synopsis

__m512d _mm512_mask_i32loextgather_pd (__m512d src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherdpd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Up-converts 8 double-precision (64-bit) floating-point elements in memory locations starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale using conv to 64-bit floating-point elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*64 IF k[j] CASE conv OF _MM_UPCONV_PD_NONE: dst[i+63:i] := addr[i+63:i] ESAC ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpscatterdq
void _mm512_i32loextscatter_epi64 (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)

Synopsis

void _mm512_i32loextscatter_epi64 (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpscatterdq m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Down-converts 8 packed 64-bit integer elements in v1 and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*64 CASE conv OF _MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v1[i+63:i] ESAC ENDFOR
vpscatterdq
void _mm512_mask_i32loextscatter_epi64 (void * mv, __mmask8 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)

Synopsis

void _mm512_mask_i32loextscatter_epi64 (void * mv, __mmask8 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpscatterdq m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Down-converts 8 packed 64-bit integer elements in v1 and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv. Only those elements whose corresponding mask bit is set in writemask k are written to memory.

Operation

FOR j := 0 to 7 IF k[j] addr := MEM[mv + index[j] * scale] i := j*64 CASE conv OF _MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v1[i+63:i] ESAC FI ENDFOR
vscatterdpd
void _mm512_i32loextscatter_pd (void * mv, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)

Synopsis

void _mm512_i32loextscatter_pd (void * mv, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterdpd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Down-converts 8 packed double-precision (64-bit) floating-point elements in v1 and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*64 CASE conv OF _MM_DOWNCONV_PD_NONE: addr[i+63:i] := v1[i+63:i] ESAC ENDFOR
vscatterdpd
void _mm512_mask_i32loextscatter_pd (void * mv, __mmask8 k, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)

Synopsis

void _mm512_mask_i32loextscatter_pd (void * mv, __mmask8 k, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterdpd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Down-converts 8 packed double-precision (64-bit) floating-point elements in v1 and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv. Only those elements whose corresponding mask bit is set in writemask k are written to memory.

Operation

FOR j := 0 to 7 IF k[j] addr := MEM[mv + index[j] * scale] i := j*64 CASE conv OF _MM_DOWNCONV_PD_NONE: addr[i+63:i] := v1[i+63:i] ESAC FI ENDFOR
vpgatherdq
__m512i _mm512_i32logather_epi64 (__m512i index, void const* mv, int scale)

Synopsis

__m512i _mm512_i32logather_epi64 (__m512i index, void const* mv, int scale)
#include "immintrin.h"
Instruction: vpgatherdq zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Loads 8 64-bit integer elements from memory starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale and stores them in dst.

Operation

FOR j := 0 to 7 i := j*64 addr := MEM[mv + index[j] * scale] dst[i+63:i] := addr[i+63:i] ENDFOR dst[MAX:512] := 0
vpgatherdq
__m512i _mm512_mask_i32logather_epi64 (__m512i src, __mmask8 k, __m512i index, void const* mv, int scale)

Synopsis

__m512i _mm512_mask_i32logather_epi64 (__m512i src, __mmask8 k, __m512i index, void const* mv, int scale)
#include "immintrin.h"
Instruction: vpgatherdq zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Loads 8 64-bit integer elements from memory starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] addr := MEM[mv + index[j] * scale] dst[i+63:i] := addr[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vgatherdpd
__m512d _mm512_i32logather_pd (__m512i index, void const* mv, int scale)

Synopsis

__m512d _mm512_i32logather_pd (__m512i index, void const* mv, int scale)
#include "immintrin.h"
Instruction: vgatherdpd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Loads 8 double-precision (64-bit) floating-point elements stored at memory locations starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale, and stores them in dst.

Operation

FOR j := 0 to 7 i := j*64 addr := MEM[mv + index[j] * scale] dst[i+63:i] := addr[i+63:i] ENDFOR dst[MAX:512] := 0
vgatherdpd
__m512d _mm512_mask_i32logather_pd (__m512d src, __mmask8 k, __m512i index, void const* mv, int scale)

Synopsis

__m512d _mm512_mask_i32logather_pd (__m512d src, __mmask8 k, __m512i index, void const* mv, int scale)
#include "immintrin.h"
Instruction: vgatherdpd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Loads 8 double-precision (64-bit) floating-point elements from memory starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] addr := MEM[mv + index[j] * scale] dst[i+63:i] := addr[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpscatterdq
void _mm512_i32loscatter_epi64 (void* mv, __m512i index, __m512i v1, int scale)

Synopsis

void _mm512_i32loscatter_epi64 (void* mv, __m512i index, __m512i v1, int scale)
#include "immintrin.h"
Instruction: vpscatterdq m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores 8 packed 64-bit integer elements from v1 in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*64 addr[i+63:i] := v1[i+63:i] ENDFOR
vpscatterdq
void _mm512_mask_i32loscatter_epi64 (void* mv, __mmask8 k, __m512i index, __m512i v1, int scale)

Synopsis

void _mm512_mask_i32loscatter_epi64 (void* mv, __mmask8 k, __m512i index, __m512i v1, int scale)
#include "immintrin.h"
Instruction: vpscatterdq m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores 8 packed 64-bit integer elements from v1 in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using writemask k (elements whose corresponding mask bit is not set are not written to memory).

Operation

FOR j := 0 to 7 IF k[j] addr := MEM[mv + index[j] * scale] i := j*64 addr[i+63:i] := v1[i+63:i] FI ENDFOR
vscatterdpd
void _mm512_i32loscatter_pd (void* mv, __m512i index, __m512d v1, int scale)

Synopsis

void _mm512_i32loscatter_pd (void* mv, __m512i index, __m512d v1, int scale)
#include "immintrin.h"
Instruction: vscatterdpd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Stores 8 packed double-precision (64-bit) floating-point elements in v1 to memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*64 addr[i+63:i] := v1[i+63:i] ENDFOR
vscatterdpd
void _mm512_mask_i32loscatter_pd (void* mv, __mmask8 k, __m512i index, __m512d v1, int scale)

Synopsis

void _mm512_mask_i32loscatter_pd (void* mv, __mmask8 k, __m512i index, __m512d v1, int scale)
#include "immintrin.h"
Instruction: vscatterdpd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Stores 8 packed double-precision (64-bit) floating-point elements in v1 to memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale. Only those elements whose corresponding mask bit is set in writemask k are written to memory.

Operation

FOR j := 0 to 7 IF k[j] addr := MEM[mv + index[j] * scale] i := j*64 addr[i+63:i] := v1[i+63:i] FI ENDFOR
vpscatterdd
void _mm_i32scatter_epi32 (void* base_addr, __m128i vindex, __m128i a, const int scale)

Synopsis

void _mm_i32scatter_epi32 (void* base_addr, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i] ENDFOR
vpscatterdd
void _mm_mask_i32scatter_epi32 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)

Synopsis

void _mm_mask_i32scatter_epi32 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i] k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpscatterdd
void _mm256_i32scatter_epi32 (void* base_addr, __m256i vindex, __m256i a, const int scale)

Synopsis

void _mm256_i32scatter_epi32 (void* base_addr, __m256i vindex, __m256i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i] ENDFOR
vpscatterdd
void _mm256_mask_i32scatter_epi32 (void* base_addr, __mmask8 k, __m256i vindex, __m256i a, const int scale)

Synopsis

void _mm256_mask_i32scatter_epi32 (void* base_addr, __mmask8 k, __m256i vindex, __m256i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i] k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpscatterdd
void _mm512_i32scatter_epi32 (void* base_addr, __m512i vindex, __m512i a, int scale)

Synopsis

void _mm512_i32scatter_epi32 (void* base_addr, __m512i vindex, __m512i a, int scale)
#include "immintrin.h"
Instruction: vpscatterdd vm32z {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 15 i := j*32 MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i] ENDFOR
vpscatterdd
void _mm512_mask_i32scatter_epi32 (void* base_addr, __mmask16 k, __m512i vindex, __m512i a, int scale)

Synopsis

void _mm512_mask_i32scatter_epi32 (void* base_addr, __mmask16 k, __m512i vindex, __m512i a, int scale)
#include "immintrin.h"
Instruction: vpscatterdd vm32z {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i] k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpscatterdq
void _mm_i32scatter_epi64 (void* base_addr, __m128i vindex, __m128i a, const int scale)

Synopsis

void _mm_i32scatter_epi64 (void* base_addr, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdq
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 l := j*32 MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i] ENDFOR
vpscatterdq
void _mm_mask_i32scatter_epi64 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)

Synopsis

void _mm_mask_i32scatter_epi64 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdq
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i] k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpscatterdq
void _mm256_i32scatter_epi64 (void* base_addr, __m128i vindex, __m256i a, const int scale)

Synopsis

void _mm256_i32scatter_epi64 (void* base_addr, __m128i vindex, __m256i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdq
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 l := j*32 MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i] ENDFOR
vpscatterdq
void _mm256_mask_i32scatter_epi64 (void* base_addr, __mmask8 k, __m128i vindex, __m256i a, const int scale)

Synopsis

void _mm256_mask_i32scatter_epi64 (void* base_addr, __mmask8 k, __m128i vindex, __m256i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdq
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i] k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpscatterdq
void _mm512_i32scatter_epi64 (void* base_addr, __m256i vindex, __m512i a, int scale)

Synopsis

void _mm512_i32scatter_epi64 (void* base_addr, __m256i vindex, __m512i a, int scale)
#include "immintrin.h"
Instruction: vpscatterdq vm32y {k}, zmm
CPUID Flags: AVX512F

Description

Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i] ENDFOR
vpscatterdq
void _mm512_mask_i32scatter_epi64 (void* base_addr, __mmask8 k, __m256i vindex, __m512i a, int scale)

Synopsis

void _mm512_mask_i32scatter_epi64 (void* base_addr, __mmask8 k, __m256i vindex, __m512i a, int scale)
#include "immintrin.h"
Instruction: vpscatterdq vm32y {k}, zmm
CPUID Flags: AVX512F

Description

Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i] k[j] := 0 FI ENDFOR k[MAX:8] := 0
vscatterdpd
void _mm_i32scatter_pd (void* base_addr, __m128i vindex, __m128d a, const int scale)

Synopsis

void _mm_i32scatter_pd (void* base_addr, __m128i vindex, __m128d a, const int scale)
#include "immintrin.h"
Instruction: vscatterdpd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 l := j*32 MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i] ENDFOR
vscatterdpd
void _mm_mask_i32scatter_pd (void* base_addr, __mmask8 k, __m128i vindex, __m128d a, const int scale)

Synopsis

void _mm_mask_i32scatter_pd (void* base_addr, __mmask8 k, __m128i vindex, __m128d a, const int scale)
#include "immintrin.h"
Instruction: vscatterdpd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i] k[j] := 0 FI ENDFOR k[MAX:2] := 0
vscatterdpd
void _mm256_i32scatter_pd (void* base_addr, __m128i vindex, __m256d a, const int scale)

Synopsis

void _mm256_i32scatter_pd (void* base_addr, __m128i vindex, __m256d a, const int scale)
#include "immintrin.h"
Instruction: vscatterdpd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 l := j*32 MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i] ENDFOR
vscatterdpd
void _mm256_mask_i32scatter_pd (void* base_addr, __mmask8 k, __m128i vindex, __m256d a, const int scale)

Synopsis

void _mm256_mask_i32scatter_pd (void* base_addr, __mmask8 k, __m128i vindex, __m256d a, const int scale)
#include "immintrin.h"
Instruction: vscatterdpd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i] k[j] := 0 FI ENDFOR k[MAX:4] := 0
vscatterdpd
void _mm512_i32scatter_pd (void* base_addr, __m256i vindex, __m512d a, int scale)

Synopsis

void _mm512_i32scatter_pd (void* base_addr, __m256i vindex, __m512d a, int scale)
#include "immintrin.h"
Instruction: vscatterdpd vm32y {k}, zmm
CPUID Flags: AVX512F

Description

Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i] ENDFOR
vscatterdpd
void _mm512_mask_i32scatter_pd (void* base_addr, __mmask8 k, __m256i vindex, __m512d a, int scale)

Synopsis

void _mm512_mask_i32scatter_pd (void* base_addr, __mmask8 k, __m256i vindex, __m512d a, int scale)
#include "immintrin.h"
Instruction: vscatterdpd vm32y {k}, zmm
CPUID Flags: AVX512F

Description

Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i] k[j] := 0 FI ENDFOR k[MAX:8] := 0
vscatterdps
void _mm_i32scatter_ps (void* base_addr, __m128i vindex, __m128 a, const int scale)

Synopsis

void _mm_i32scatter_ps (void* base_addr, __m128i vindex, __m128 a, const int scale)
#include "immintrin.h"
Instruction: vscatterdps
CPUID Flags: AVX512VL + AVX512F

Description

Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i] ENDFOR
vscatterdps
void _mm_mask_i32scatter_ps (void* base_addr, __mmask8 k, __m128i vindex, __m128 a, const int scale)

Synopsis

void _mm_mask_i32scatter_ps (void* base_addr, __mmask8 k, __m128i vindex, __m128 a, const int scale)
#include "immintrin.h"
Instruction: vscatterdps
CPUID Flags: AVX512VL + AVX512F

Description

Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i] k[j] := 0 FI ENDFOR k[MAX:4] := 0
vscatterdps
void _mm256_i32scatter_ps (void* base_addr, __m256i vindex, __m256 a, const int scale)

Synopsis

void _mm256_i32scatter_ps (void* base_addr, __m256i vindex, __m256 a, const int scale)
#include "immintrin.h"
Instruction: vscatterdps
CPUID Flags: AVX512VL + AVX512F

Description

Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i] ENDFOR
vscatterdps
void _mm256_mask_i32scatter_ps (void* base_addr, __mmask8 k, __m256i vindex, __m256 a, const int scale)

Synopsis

void _mm256_mask_i32scatter_ps (void* base_addr, __mmask8 k, __m256i vindex, __m256 a, const int scale)
#include "immintrin.h"
Instruction: vscatterdps
CPUID Flags: AVX512VL + AVX512F

Description

Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i] k[j] := 0 FI ENDFOR k[MAX:8] := 0
vscatterdps
void _mm512_i32scatter_ps (void* base_addr, __m512i vindex, __m512 a, int scale)

Synopsis

void _mm512_i32scatter_ps (void* base_addr, __m512i vindex, __m512 a, int scale)
#include "immintrin.h"
Instruction: vscatterdps vm32z {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 15 i := j*32 MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i] ENDFOR
vscatterdps
void _mm512_mask_i32scatter_ps (void* base_addr, __mmask16 k, __m512i vindex, __m512 a, int scale)

Synopsis

void _mm512_mask_i32scatter_ps (void* base_addr, __mmask16 k, __m512i vindex, __m512 a, int scale)
#include "immintrin.h"
Instruction: vscatterdps vm32z {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i] k[j] := 0 FI ENDFOR k[MAX:16] := 0
...
__m512i _mm512_i64extgather_epi32lo (__m512i index, void const* mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)

Synopsis

__m512i _mm512_i64extgather_epi32lo (__m512i index, void const* mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Up-converts 8 memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to 32-bit integer elements and stores them in dst. hint indicates to the processor whether the data is non-temporal.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*32 CASE conv OF _MM_UPCONV_EPI32_NONE: dst[i+31:i] := addr[i+31:i] _MM_UPCONV_EPI32_UINT8: n := j*8 dst[i+31:i] := UInt8ToInt32(addr[n+7:n]) _MM_UPCONV_EPI32_SINT8: n := j*8 dst[i+31:i] := SInt8ToInt32(addr[n+7:n]) _MM_UPCONV_EPI32_UINT16: n := j*16 dst[i+31:i] := UInt16ToInt32(addr[n+15:n]) _MM_UPCONV_EPI32_SINT16: n := j*16 dst[i+31:i] := SInt16ToInt32(addr[n+15:n]) ESAC ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_mask_i64extgather_epi32lo (__m512i src, __mmask8 k, __m512i index, void const* mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)

Synopsis

__m512i _mm512_mask_i64extgather_epi32lo (__m512i src, __mmask8 k, __m512i index, void const* mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Up-converts 8 memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to 32-bit integer elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*32 IF k[j] CASE conv OF _MM_UPCONV_EPI32_NONE: dst[i+31:i] := addr[i+31:i] _MM_UPCONV_EPI32_UINT8: n := j*8 dst[i+31:i] := UInt8ToInt32(addr[n+7:n]) _MM_UPCONV_EPI32_SINT8: n := j*8 dst[i+31:i] := SInt8ToInt32(addr[n+7:n]) _MM_UPCONV_EPI32_UINT16: n := j*16 dst[i+31:i] := UInt16ToInt32(addr[n+15:n]) _MM_UPCONV_EPI32_SINT16: n := j*16 dst[i+31:i] := SInt16ToInt32(addr[n+15:n]) ESAC ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_i64extgather_epi64 (__m512i index, void const* mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)

Synopsis

__m512i _mm512_i64extgather_epi64 (__m512i index, void const* mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Up-converts 8 memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to 64-bit integer elements and stores them in dst. hint indicates to the processor whether the load is non-temporal.

Operation

FOR j := 0 to 7 i := j*64 addr := MEM[mv + index[j] * scale] CASE conv OF _MM_UPCONV_EPI64_NONE: dst[i+63:i] := addr[i+63:i] ESAC ENDFOR dst[MAX:512] := 0
...
__m512i _mm512_mask_i64extgather_epi64 (__m512i src, __mmask8 k, __m512i index, void const* mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)

Synopsis

__m512i _mm512_mask_i64extgather_epi64 (__m512i src, __mmask8 k, __m512i index, void const* mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Up-converts 8 memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to 64-bit integer elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the load is non-temporal.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] addr := MEM[mv + index[j] * scale] CASE conv OF _MM_UPCONV_EPI64_NONE: dst[i+63:i] := addr[i+63:i] ESAC ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_i64extgather_pd (__m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)

Synopsis

__m512d _mm512_i64extgather_pd (__m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Up-converts 8 double-precision (64-bit) floating-point elements stored in memory starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to 64-bit floating-point elements and stores them in dst. hint indicates to the processor whether the data is non-temporal.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*64 CASE conv OF _MM_UPCONV_PD_NONE: dst[i+63:i] := addr[i+63:i] ESAC ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_i64extgather_pd (__m512d src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)

Synopsis

__m512d _mm512_mask_i64extgather_pd (__m512d src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Up-converts 8 double-precision (64-bit) floating-point elements stored in memory starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to 64-bit floating-point elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*64 IF k[j] CASE conv OF _MM_UPCONV_PD_NONE: dst[i+63:i] := addr[i+63:i] ESAC ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_i64extgather_pslo (__m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)

Synopsis

__m512 _mm512_i64extgather_pslo (__m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Up-converts 8 memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to single-precision (32-bit) floating-point elements and stores them in the lower half of dst. hint indicates to the processor whether the load is non-temporal.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*32 CASE conv OF _MM_UPCONV_PS_NONE: dst[i+31:i] := addr[i+31:i] _MM_UPCONV_PS_FLOAT16: n := j*16 dst[i+31:i] := Float16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_UINT8: n := j*8 dst[i+31:i] := UInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_SINT8: n := j*8 dst[i+31:i] := SInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_UINT16: n := j*16 dst[i+31:i] := UInt16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_SINT16: n := j*16 dst[i+31:i] := SInt16ToFloat32(addr[n+15:n]) ESAC ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_mask_i64extgather_pslo (__m512 src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)

Synopsis

__m512 _mm512_mask_i64extgather_pslo (__m512 src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Up-converts 8 memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to single-precision (32-bit) floating-point elements and stores them in the lower half of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the load is non-temporal.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*32 IF k[j] CASE conv OF _MM_UPCONV_PS_NONE: dst[i+31:i] := addr[i+31:i] _MM_UPCONV_PS_FLOAT16: n := j*16 dst[i+31:i] := Float16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_UINT8: n := j*8 dst[i+31:i] := UInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_SINT8: n := j*8 dst[i+31:i] := SInt8ToFloat32(addr[n+7:n]) _MM_UPCONV_PS_UINT16: n := j*16 dst[i+31:i] := UInt16ToFloat32(addr[n+15:n]) _MM_UPCONV_PS_SINT16: n := j*16 dst[i+31:i] := SInt16ToFloat32(addr[n+15:n]) ESAC ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
...
void _mm512_i64extscatter_epi32lo (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)

Synopsis

void _mm512_i64extscatter_epi32lo (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Down-converts the low 8 packed 32-bit integer elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. hint indicates to the processor whether the data is non-temporal.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*32 CASE conv OF _MM_DOWNCONV_EPI32_NONE: addr[i+31:i] := v1[i+31:i] _MM_DOWNCONV_EPI32_UINT8: n := j*8 addr[n+7:n] := UInt32ToUInt8(v1[i+31:i]) _MM_DOWNCONV_EPI32_SINT8: n := j*8 addr[n+7:n] := SInt32ToSInt8(v1[i+31:i]) _MM_DOWNCONV_EPI32_UINT16: n := j*16 addr[n+15:n] := UInt32ToUInt16(v1[i+31:i]) _MM_DOWNCONV_EPI32_SINT16: n := j*16 addr[n+15:n] := SInt32ToSInt16(v1[i+31:i]) ESAC ENDFOR
...
void _mm512_mask_i64extscatter_epi32lo (void * mv, __mmask8 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)

Synopsis

void _mm512_mask_i64extscatter_epi32lo (void * mv, __mmask8 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Down-converts the low 8 packed 32-bit integer elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. Elements are written to memory using writemask k (elements are only written when the corresponding mask bit is set; otherwise, the memory location is left unchanged). hint indicates to the processor whether the data is non-temporal.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*32 IF k[j] CASE conv OF _MM_DOWNCONV_EPI32_NONE: addr[i+31:i] := v1[i+31:i] _MM_DOWNCONV_EPI32_UINT8: n := j*8 addr[n+7:n] := UInt32ToUInt8(v1[i+31:i]) _MM_DOWNCONV_EPI32_SINT8: n := j*8 addr[n+7:n] := SInt32ToSInt8(v1[i+31:i]) _MM_DOWNCONV_EPI32_UINT16: n := j*16 addr[n+15:n] := UInt32ToUInt16(v1[i+31:i]) _MM_DOWNCONV_EPI32_SINT16: n := j*16 addr[n+15:n] := SInt32ToSInt16(v1[i+31:i]) ESAC FI ENDFOR
...
void _mm512_i64extscatter_epi64 (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)

Synopsis

void _mm512_i64extscatter_epi64 (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Down-converts 8 packed 64-bit integer elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. hint indicates to the processor whether the data is non-temporal.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*64 CASE conv OF _MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v1[i+63:i] ESAC ENDFOR
...
void _mm512_mask_i64extscatter_epi64 (void * mv, __mmask8 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)

Synopsis

void _mm512_mask_i64extscatter_epi64 (void * mv, __mmask8 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Down-converts 8 packed 64-bit integer elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. Only those elements whose corresponding mask bit is set in writemask k are written to memory.

Operation

FOR j := 0 to 7 IF k[j] addr := MEM[mv + index[j] * scale] i := j*64 CASE conv OF _MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v1[i+63:i] ESAC FI ENDFOR
...
void _mm512_i64extscatter_pd (void * mv, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)

Synopsis

void _mm512_i64extscatter_pd (void * mv, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Down-converts 8 packed double-precision (64-bit) floating-point elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. hint indicates to the processor whether the data is non-temporal.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*64 CASE conv OF _MM_DOWNCONV_PD_NONE: addr[i+63:i] := v1[i+63:i] ESAC ENDFOR
...
void _mm512_mask_i64extscatter_pd (void * mv, __mmask8 k, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)

Synopsis

void _mm512_mask_i64extscatter_pd (void * mv, __mmask8 k, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Down-converts 8 packed double-precision (64-bit) floating-point elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. Elements are written to memory using writemask k (elements are not stored to memory when the corresponding mask bit is not set; the memory location is left unchanged). hint indicates to the processor whether the data is non-temporal.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*64 IF k[j] CASE conv OF _MM_DOWNCONV_PD_NONE: addr[i+63:i] := v1[i+63:i] ESAC FI ENDFOR
...
void _mm512_i64extscatter_pslo (void * mv, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)

Synopsis

void _mm512_i64extscatter_pslo (void * mv, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Down-converts 8 packed single-precision (32-bit) floating-point elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. hint indicates to the processor whether the data is non-temporal.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*32 CASE conv OF _MM_DOWNCONV_PS_NONE: addr[i+31:i] := v1[i+31:i] _MM_DOWNCONV_PS_FLOAT16: n := j*16 addr[n+15:n] := Float32ToFloat16(v1[i+31:i]) _MM_DOWNCONV_PS_UINT8: n := j*8 addr[n+7:n] := Float32ToUInt8(v1[i+31:i]) _MM_DOWNCONV_PS_SINT8: n := j*8 addr[n+7:n] := Float32ToSInt8(v1[i+31:i]) _MM_DOWNCONV_PS_UINT16: n := j*16 addr[n+15:n] := Float32ToUInt16(v1[i+31:i]) _MM_DOWNCONV_PS_SINT16: n := j*16 addr[n+15:n] := Float32ToSInt16(v1[i+31:i]) ESAC ENDFOR
...
void _mm512_mask_i64extscatter_pslo (void * mv, __mmask8 k, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)

Synopsis

void _mm512_mask_i64extscatter_pslo (void * mv, __mmask8 k, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Down-converts 8 packed single-precision (32-bit) floating-point elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. Elements are only written when the corresponding mask bit is set in k; otherwise, elements are unchanged in memory. hint indicates to the processor whether the data is non-temporal.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*32 IF k[j] CASE conv OF _MM_DOWNCONV_PS_NONE: addr[i+31:i] := v1[i+31:i] _MM_DOWNCONV_PS_FLOAT16: n := j*16 addr[n+15:n] := Float32ToFloat16(v1[i+31:i]) _MM_DOWNCONV_PS_UINT8: n := j*8 addr[n+7:n] := Float32ToUInt8(v1[i+31:i]) _MM_DOWNCONV_PS_SINT8: n := j*8 addr[n+7:n] := Float32ToSInt8(v1[i+31:i]) _MM_DOWNCONV_PS_UINT16: n := j*16 addr[n+15:n] := Float32ToUInt16(v1[i+31:i]) _MM_DOWNCONV_PS_SINT16: n := j*16 addr[n+15:n] := Float32ToSInt16(v1[i+31:i]) ESAC FI ENDFOR
vpgatherqd
__m128i _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, const int scale)

Synopsis

__m128i _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherqd xmm, vm32x, xmm
CPUID Flags: AVX2

Description

Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*32 m := j*64 dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] ENDFOR dst[MAX:64] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vpgatherqd
__m128i _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale)

Synopsis

__m128i _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherqd xmm, vm32x, xmm
CPUID Flags: AVX2

Description

Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*32 m := j*64 IF mask[i+31] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] mask[i+31] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR mask[MAX:64] := 0 dst[MAX:64] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vpgatherqd
__m128i _mm_mmask_i64gather_epi32 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)

Synopsis

__m128i _mm_mmask_i64gather_epi32 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherqd
CPUID Flags: AVX512VL + AVX512F

Description

Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*32 m := j*64 IF k[j] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] k[j] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR k[MAX:2] := 0 dst[MAX:64] := 0
vpgatherqd
__m128i _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale)

Synopsis

__m128i _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherqd ymm, vm32x, ymm
CPUID Flags: AVX2

Description

Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 m := j*64 dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] ENDFOR dst[MAX:128] := 0
vpgatherqd
__m128i _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale)

Synopsis

__m128i _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherqd ymm, vm32x, ymm
CPUID Flags: AVX2

Description

Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 m := j*64 IF mask[i+31] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] mask[i+31] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR mask[MAX:128] := 0 dst[MAX:128] := 0
vpgatherqd
__m128i _mm256_mmask_i64gather_epi32 (__m128i src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)

Synopsis

__m128i _mm256_mmask_i64gather_epi32 (__m128i src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherqd
CPUID Flags: AVX512VL + AVX512F

Description

Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 m := j*64 IF k[j] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] k[j] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR k[MAX:4] := 0 dst[MAX:128] := 0
vpgatherqd
__m256i _mm512_i64gather_epi32 (__m512i vindex, void const* base_addr, int scale)

Synopsis

__m256i _mm512_i64gather_epi32 (__m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherqd ymm {k}, vm64z
CPUID Flags: AVX512F

Description

Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 m := j*64 dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] ENDFOR dst[MAX:256] := 0
vpgatherqd
__m256i _mm512_mask_i64gather_epi32 (__m256i src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)

Synopsis

__m256i _mm512_mask_i64gather_epi32 (__m256i src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherqd ymm {k}, vm64z
CPUID Flags: AVX512F

Description

Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 m := j*64 IF k[j] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] k[j] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR k[MAX:8] := 0 dst[MAX:256] := 0
...
__m512i _mm512_i64gather_epi32lo (__m512i index, void const * mv, int scale)

Synopsis

__m512i _mm512_i64gather_epi32lo (__m512i index, void const * mv, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Loads 8 32-bit integer memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale to dst.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*32 dst[i+31:i] := addr[i+31:i] ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_mask_i64gather_epi32lo (__m512i src, __mmask8 k, __m512i index, void const * mv, int scale)

Synopsis

__m512i _mm512_mask_i64gather_epi32lo (__m512i src, __mmask8 k, __m512i index, void const * mv, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Loads 8 32-bit integer memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] addr := MEM[mv + index[j] * scale] dst[i+31:i] := addr[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpgatherqq
__m128i _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)

Synopsis

__m128i _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherqq xmm, vm64x, xmm
CPUID Flags: AVX2

Description

Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vpgatherqq
__m128i _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale)

Synopsis

__m128i _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherqq xmm, vm64x, xmm
CPUID Flags: AVX2

Description

Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 IF mask[i+63] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] mask[i+63] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR mask[MAX:128] := 0 dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vpgatherqq
__m128i _mm_mmask_i64gather_epi64 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)

Synopsis

__m128i _mm_mmask_i64gather_epi64 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherqq
CPUID Flags: AVX512VL + AVX512F

Description

Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] k[j] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR k[MAX:2] := 0 dst[MAX:128] := 0
vpgatherqq
__m256i _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale)

Synopsis

__m256i _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherqq ymm, vm64x, ymm
CPUID Flags: AVX2

Description

Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vpgatherqq
__m256i _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale)

Synopsis

__m256i _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherqq ymm, vm64x, ymm
CPUID Flags: AVX2

Description

Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 IF mask[i+63] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] mask[i+63] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR mask[MAX:256] := 0 dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vpgatherqq
__m256i _mm256_mmask_i64gather_epi64 (__m256i src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)

Synopsis

__m256i _mm256_mmask_i64gather_epi64 (__m256i src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherqq
CPUID Flags: AVX512VL + AVX512F

Description

Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] k[j] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR k[MAX:4] := 0 dst[MAX:256] := 0
vpgatherqq
__m512i _mm512_i64gather_epi64 (__m512i vindex, void const* base_addr, int scale)

Synopsis

__m512i _mm512_i64gather_epi64 (__m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherqq zmm {k}, vm64z
CPUID Flags: AVX512F

Description

Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] ENDFOR dst[MAX:512] := 0
vpgatherqq
__m512i _mm512_mask_i64gather_epi64 (__m512i src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)

Synopsis

__m512i _mm512_mask_i64gather_epi64 (__m512i src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherqq zmm {k}, vm64z
CPUID Flags: AVX512F

Description

Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] k[j] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR k[MAX:8] := 0 dst[MAX:512] := 0
vgatherqpd
__m128d _mm_i64gather_pd (double const* base_addr, __m128i vindex, const int scale)

Synopsis

__m128d _mm_i64gather_pd (double const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherqpd xmm, vm64x, xmm
CPUID Flags: AVX2

Description

Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherqpd
__m128d _mm_mask_i64gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale)

Synopsis

__m128d _mm_mask_i64gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale)
#include "immintrin.h"
Instruction: vgatherqpd xmm, vm64x, xmm
CPUID Flags: AVX2

Description

Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 IF mask[i+63] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] mask[i+63] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR mask[MAX:128] := 0 dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherqpd
__m128d _mm_mmask_i64gather_pd (__m128d src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)

Synopsis

__m128d _mm_mmask_i64gather_pd (__m128d src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherqpd
CPUID Flags: AVX512VL + AVX512F

Description

Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] k[j] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR k[MAX:2] := 0 dst[MAX:128] := 0
vgatherqpd
__m256d _mm256_i64gather_pd (double const* base_addr, __m256i vindex, const int scale)

Synopsis

__m256d _mm256_i64gather_pd (double const* base_addr, __m256i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherqpd ymm, vm64x, ymm
CPUID Flags: AVX2

Description

Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherqpd
__m256d _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale)

Synopsis

__m256d _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale)
#include "immintrin.h"
Instruction: vgatherqpd ymm, vm64x, ymm
CPUID Flags: AVX2

Description

Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 IF mask[i+63] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] mask[i+63] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR mask[MAX:256] := 0 dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherqpd
__m256d _mm256_mmask_i64gather_pd (__m256d src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)

Synopsis

__m256d _mm256_mmask_i64gather_pd (__m256d src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherqpd
CPUID Flags: AVX512VL + AVX512F

Description

Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] k[j] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR k[MAX:4] := 0 dst[MAX:256] := 0
vgatherqpd
__m512d _mm512_i64gather_pd (__m512i vindex, void const* base_addr, int scale)

Synopsis

__m512d _mm512_i64gather_pd (__m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherqpd zmm {k}, vm64z
CPUID Flags: AVX512F

Description

Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] ENDFOR dst[MAX:512] := 0
vgatherqpd
__m512d _mm512_mask_i64gather_pd (__m512d src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)

Synopsis

__m512d _mm512_mask_i64gather_pd (__m512d src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherqpd zmm {k}, vm64z
CPUID Flags: AVX512F

Description

Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale] k[j] := 0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR k[MAX:8] := 0 dst[MAX:512] := 0
vgatherqps
__m128 _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale)

Synopsis

__m128 _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherqps xmm, vm64x, xmm
CPUID Flags: AVX2

Description

Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*32 m := j*64 dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] ENDFOR dst[MAX:64] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherqps
__m128 _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale)

Synopsis

__m128 _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale)
#include "immintrin.h"
Instruction: vgatherqps xmm, vm64x, xmm
CPUID Flags: AVX2

Description

Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*32 m := j*64 IF mask[i+31] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] mask[i+31] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR mask[MAX:64] := 0 dst[MAX:64] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherqps
__m128 _mm_mmask_i64gather_ps (__m128 src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)

Synopsis

__m128 _mm_mmask_i64gather_ps (__m128 src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherqps
CPUID Flags: AVX512VL + AVX512F

Description

Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*32 m := j*64 IF k[j] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] k[j] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR k[MAX:2] := 0 dst[MAX:64] := 0
vgatherqps
__m128 _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale)

Synopsis

__m128 _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherqps xmm, vm64y, xmm
CPUID Flags: AVX2

Description

Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 m := j*64 dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherqps
__m128 _mm256_mask_i64gather_ps (__m128 src, float const* base_addr, __m256i vindex, __m128 mask, const int scale)

Synopsis

__m128 _mm256_mask_i64gather_ps (__m128 src, float const* base_addr, __m256i vindex, __m128 mask, const int scale)
#include "immintrin.h"
Instruction: vgatherqps xmm, vm64y, xmm
CPUID Flags: AVX2

Description

Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 m := j*64 IF mask[i+31] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] mask[i+31] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR mask[MAX:128] := 0 dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell6-
vgatherqps
__m128 _mm256_mmask_i64gather_ps (__m128 src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)

Synopsis

__m128 _mm256_mmask_i64gather_ps (__m128 src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherqps
CPUID Flags: AVX512VL + AVX512F

Description

Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 m := j*64 IF k[j] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] k[j] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR k[MAX:4] := 0 dst[MAX:128] := 0
vgatherqps
__m256 _mm512_i64gather_ps (__m512i vindex, void const* base_addr, int scale)

Synopsis

__m256 _mm512_i64gather_ps (__m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherqps ymm {k}, vm64z
CPUID Flags: AVX512F

Description

Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 m := j*64 dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] ENDFOR dst[MAX:256] := 0
vgatherqps
__m256 _mm512_mask_i64gather_ps (__m256 src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)

Synopsis

__m256 _mm512_mask_i64gather_ps (__m256 src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherqps ymm {k}, vm64z
CPUID Flags: AVX512F

Description

Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 m := j*64 IF k[j] dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale] k[j] := 0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR k[MAX:8] := 0 dst[MAX:256] := 0
...
__m512 _mm512_i64gather_pslo (__m512i index, void const * mv, int scale)

Synopsis

__m512 _mm512_i64gather_pslo (__m512i index, void const * mv, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Loads 8 single-precision (32-bit) floating-point memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale to dst.

Operation

FOR j := 0 to 7 addr := MEM[mv + index[j] * scale] i := j*32 dst[i+31:i] := addr[i+31:i] ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_mask_i64gather_pslo (__m512 src, __mmask8 k, __m512i index, void const * mv, int scale)

Synopsis

__m512 _mm512_mask_i64gather_pslo (__m512 src, __mmask8 k, __m512i index, void const * mv, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Loads 8 single-precision (32-bit) floating-point memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] addr := MEM[mv + index[j] * scale] dst[i+31:i] := addr[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpscatterqd
void _mm_i64scatter_epi32 (void* base_addr, __m128i vindex, __m128i a, const int scale)

Synopsis

void _mm_i64scatter_epi32 (void* base_addr, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*32 l := j*64 MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i] ENDFOR
vpscatterqd
void _mm_mask_i64scatter_epi32 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)

Synopsis

void _mm_mask_i64scatter_epi32 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*32 l := j*64 IF k[j] MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i] k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpscatterqd
void _mm256_i64scatter_epi32 (void* base_addr, __m256i vindex, __m128i a, const int scale)

Synopsis

void _mm256_i64scatter_epi32 (void* base_addr, __m256i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 l := j*64 MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i] ENDFOR
vpscatterqd
void _mm256_mask_i64scatter_epi32 (void* base_addr, __mmask8 k, __m256i vindex, __m128i a, const int scale)

Synopsis

void _mm256_mask_i64scatter_epi32 (void* base_addr, __mmask8 k, __m256i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 l := j*64 IF k[j] MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i] k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpscatterqd
void _mm512_i64scatter_epi32 (void* base_addr, __m512i vindex, __m256i a, int scale)

Synopsis

void _mm512_i64scatter_epi32 (void* base_addr, __m512i vindex, __m256i a, int scale)
#include "immintrin.h"
Instruction: vpscatterqd vm64z {k}, ymm
CPUID Flags: AVX512F

Description

Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 l := j*64 MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i] ENDFOR
vpscatterqd
void _mm512_mask_i64scatter_epi32 (void* base_addr, __mmask8 k, __m512i vindex, __m256i a, int scale)

Synopsis

void _mm512_mask_i64scatter_epi32 (void* base_addr, __mmask8 k, __m512i vindex, __m256i a, int scale)
#include "immintrin.h"
Instruction: vpscatterqd vm64z {k}, ymm
CPUID Flags: AVX512F

Description

Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i] k[j] := 0 FI ENDFOR k[MAX:8] := 0
...
void _mm512_i64scatter_epi32lo (void * mv, __m512i index, __m512i v1, int scale)

Synopsis

void _mm512_i64scatter_epi32lo (void * mv, __m512i index, __m512i v1, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Stores 8 packed 32-bit integer elements in v1 in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale.

Operation

FOR j := 0 to 7 i := j*32 addr := MEM[mv + index[j] * scale] addr[i+31:i] := v1[i+31:i] ENDFOR
...
void _mm512_mask_i64scatter_epi32lo (void * mv, __mmask8 k, __m512i index, __m512i v1, int scale)

Synopsis

void _mm512_mask_i64scatter_epi32lo (void * mv, __mmask8 k, __m512i index, __m512i v1, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Stores 8 packed 32-bit integer elements in v1 in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using writemask k (elements are only written to memory when the corresponding mask bit is set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] addr := MEM[mv + index[j] * scale] addr[i+31:i] := v1[i+31:i] FI ENDFOR
vpscatterqq
void _mm_i64scatter_epi64 (void* base_addr, __m128i vindex, __m128i a, const int scale)

Synopsis

void _mm_i64scatter_epi64 (void* base_addr, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqq
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i] ENDFOR
vpscatterqq
void _mm_mask_i64scatter_epi64 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)

Synopsis

void _mm_mask_i64scatter_epi64 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqq
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i] k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpscatterqq
void _mm256_i64scatter_epi64 (void* base_addr, __m256i vindex, __m256i a, const int scale)

Synopsis

void _mm256_i64scatter_epi64 (void* base_addr, __m256i vindex, __m256i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqq
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i] ENDFOR
vpscatterqq
void _mm256_mask_i64scatter_epi64 (void* base_addr, __mmask8 k, __m256i vindex, __m256i a, const int scale)

Synopsis

void _mm256_mask_i64scatter_epi64 (void* base_addr, __mmask8 k, __m256i vindex, __m256i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqq
CPUID Flags: AVX512VL + AVX512F

Description

Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i] k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpscatterqq
void _mm512_i64scatter_epi64 (void* base_addr, __m512i vindex, __m512i a, int scale)

Synopsis

void _mm512_i64scatter_epi64 (void* base_addr, __m512i vindex, __m512i a, int scale)
#include "immintrin.h"
Instruction: vpscatterqq vm64z {k}, zmm
CPUID Flags: AVX512F

Description

Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i] ENDFOR
vpscatterqq
void _mm512_mask_i64scatter_epi64 (void* base_addr, __mmask8 k, __m512i vindex, __m512i a, int scale)

Synopsis

void _mm512_mask_i64scatter_epi64 (void* base_addr, __mmask8 k, __m512i vindex, __m512i a, int scale)
#include "immintrin.h"
Instruction: vpscatterqq vm64z {k}, zmm
CPUID Flags: AVX512F

Description

Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i] k[j] := 0 FI ENDFOR k[MAX:8] := 0
vscatterqpd
void _mm_i64scatter_pd (void* base_addr, __m128i vindex, __m128d a, const int scale)

Synopsis

void _mm_i64scatter_pd (void* base_addr, __m128i vindex, __m128d a, const int scale)
#include "immintrin.h"
Instruction: vscatterqpd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i] ENDFOR
vscatterqpd
void _mm_mask_i64scatter_pd (void* base_addr, __mmask8 k, __m128i vindex, __m128d a, const int scale)

Synopsis

void _mm_mask_i64scatter_pd (void* base_addr, __mmask8 k, __m128i vindex, __m128d a, const int scale)
#include "immintrin.h"
Instruction: vscatterqpd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i] k[j] := 0 FI ENDFOR k[MAX:2] := 0
vscatterqpd
void _mm256_i64scatter_pd (void* base_addr, __m256i vindex, __m256d a, const int scale)

Synopsis

void _mm256_i64scatter_pd (void* base_addr, __m256i vindex, __m256d a, const int scale)
#include "immintrin.h"
Instruction: vscatterqpd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i] ENDFOR
vscatterqpd
void _mm256_mask_i64scatter_pd (void* base_addr, __mmask8 k, __m256i vindex, __m256d a, const int scale)

Synopsis

void _mm256_mask_i64scatter_pd (void* base_addr, __mmask8 k, __m256i vindex, __m256d a, const int scale)
#include "immintrin.h"
Instruction: vscatterqpd
CPUID Flags: AVX512VL + AVX512F

Description

Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i] k[j] := 0 FI ENDFOR k[MAX:4] := 0
vscatterqpd
void _mm512_i64scatter_pd (void* base_addr, __m512i vindex, __m512d a, int scale)

Synopsis

void _mm512_i64scatter_pd (void* base_addr, __m512i vindex, __m512d a, int scale)
#include "immintrin.h"
Instruction: vscatterqpd vm64z {k}, zmm
CPUID Flags: AVX512F

Description

Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i] ENDFOR
vscatterqpd
void _mm512_mask_i64scatter_pd (void* base_addr, __mmask8 k, __m512i vindex, __m512d a, int scale)

Synopsis

void _mm512_mask_i64scatter_pd (void* base_addr, __mmask8 k, __m512i vindex, __m512d a, int scale)
#include "immintrin.h"
Instruction: vscatterqpd vm64z {k}, zmm
CPUID Flags: AVX512F

Description

Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i] k[j] := 0 FI ENDFOR k[MAX:8] := 0
vscatterqps
void _mm_i64scatter_ps (void* base_addr, __m128i vindex, __m128 a, const int scale)

Synopsis

void _mm_i64scatter_ps (void* base_addr, __m128i vindex, __m128 a, const int scale)
#include "immintrin.h"
Instruction: vscatterqps
CPUID Flags: AVX512VL + AVX512F

Description

Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*32 l := j*64 MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i] ENDFOR
vscatterqps
void _mm_mask_i64scatter_ps (void* base_addr, __mmask8 k, __m128i vindex, __m128 a, const int scale)

Synopsis

void _mm_mask_i64scatter_ps (void* base_addr, __mmask8 k, __m128i vindex, __m128 a, const int scale)
#include "immintrin.h"
Instruction: vscatterqps
CPUID Flags: AVX512VL + AVX512F

Description

Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 1 i := j*32 l := j*64 IF k[j] MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i] k[j] := 0 FI ENDFOR k[MAX:2] := 0
vscatterqps
void _mm256_i64scatter_ps (void* base_addr, __m256i vindex, __m128 a, const int scale)

Synopsis

void _mm256_i64scatter_ps (void* base_addr, __m256i vindex, __m128 a, const int scale)
#include "immintrin.h"
Instruction: vscatterqps
CPUID Flags: AVX512VL + AVX512F

Description

Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 l := j*64 MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i] ENDFOR
vscatterqps
void _mm256_mask_i64scatter_ps (void* base_addr, __mmask8 k, __m256i vindex, __m128 a, const int scale)

Synopsis

void _mm256_mask_i64scatter_ps (void* base_addr, __mmask8 k, __m256i vindex, __m128 a, const int scale)
#include "immintrin.h"
Instruction: vscatterqps
CPUID Flags: AVX512VL + AVX512F

Description

Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 3 i := j*32 l := j*64 IF k[j] MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i] k[j] := 0 FI ENDFOR k[MAX:4] := 0
vscatterqps
void _mm512_i64scatter_ps (void* base_addr, __m512i vindex, __m256 a, int scale)

Synopsis

void _mm512_i64scatter_ps (void* base_addr, __m512i vindex, __m256 a, int scale)
#include "immintrin.h"
Instruction: vscatterqps vm64z {k}, ymm
CPUID Flags: AVX512F

Description

Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 l := j*64 MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i] ENDFOR
vscatterqps
void _mm512_mask_i64scatter_ps (void* base_addr, __mmask8 k, __m512i vindex, __m256 a, int scale)

Synopsis

void _mm512_mask_i64scatter_ps (void* base_addr, __mmask8 k, __m512i vindex, __m256 a, int scale)
#include "immintrin.h"
Instruction: vscatterqps vm64z {k}, ymm
CPUID Flags: AVX512F

Description

Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i] k[j] := 0 FI ENDFOR k[MAX:8] := 0
...
void _mm512_i64scatter_pslo (void * mv, __m512i index, __m512 v, int scale)

Synopsis

void _mm512_i64scatter_pslo (void * mv, __m512i index, __m512 v, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Stores 8 packed single-precision (32-bit) floating-point elements in v in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale.

Operation

FOR j := 0 to 7 i := j*32 addr := MEM[mv + index[j] * scale] addr[i+31:i] := v[i+31:i] ENDFOR
...
void _mm512_mask_i64scatter_pslo (void * mv, __mmask8 k, __m512i index, __m512 v1, int scale)

Synopsis

void _mm512_mask_i64scatter_pslo (void * mv, __mmask8 k, __m512i index, __m512 v1, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Stores 8 packed single-precision (32-bit) floating-point elements in v1 in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using writemask k (elements are only written to memory when the corresponding mask bit is set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] addr := MEM[mv + index[j] * scale] addr[i+31:i] := v1[i+31:i] FI ENDFOR
...
__m128i _mm_idiv_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_idiv_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed 32-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_idiv_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_idiv_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed 32-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m128i _mm_idivrem_epi32 (__m128i * mem_addr, __m128i a, __m128i b)

Synopsis

__m128i _mm_idivrem_epi32 (__m128i * mem_addr, __m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed 32-bit integers in a by packed elements in b, store the truncated results in dst, and store the remainders as packed 32-bit integers into memory at mem_addr.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_idivrem_epi32 (__m256i * mem_addr, __m256i a, __m256i b)

Synopsis

__m256i _mm256_idivrem_epi32 (__m256i * mem_addr, __m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed 32-bit integers in a by packed elements in b, store the truncated results in dst, and store the remainders as packed 32-bit integers into memory at mem_addr.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:256] := 0
pinsrw
__m128i _mm_insert_epi16 (__m128i a, int i, int imm8)

Synopsis

__m128i _mm_insert_epi16 (__m128i a, int i, int imm8)
#include "emmintrin.h"
Instruction: pinsrw xmm, r32, imm
CPUID Flags: SSE2

Description

Copy a to dst, and insert the 16-bit integer i into dst at the location specified by imm8.

Operation

dst[127:0] := a[127:0] sel := imm8[2:0]*16 dst[sel+15:sel] := i[15:0]

Performance

ArchitectureLatencyThroughput
Haswell22
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
...
__m256i _mm256_insert_epi16 (__m256i a, __int16 i, const int index)

Synopsis

__m256i _mm256_insert_epi16 (__m256i a, __int16 i, const int index)
#include "immintrin.h"
CPUID Flags: AVX

Description

Copy a to dst, and insert the 16-bit integer i into dst at the location specified by index.

Operation

dst[255:0] := a[255:0] sel := index*16 dst[sel+15:sel] := i[15:0]
pinsrd
__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8)

Synopsis

__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8)
#include "smmintrin.h"
Instruction: pinsrd xmm, r32, imm
CPUID Flags: SSE4.1

Description

Copy a to dst, and insert the 32-bit integer i into dst at the location specified by imm8.

Operation

dst[127:0] := a[127:0] sel := imm8[1:0]*32 dst[sel+31:sel] := i[31:0]

Performance

ArchitectureLatencyThroughput
Haswell2-
Ivy Bridge2-
Sandy Bridge2-
Westmere1-
Nehalem1-
...
__m256i _mm256_insert_epi32 (__m256i a, __int32 i, const int index)

Synopsis

__m256i _mm256_insert_epi32 (__m256i a, __int32 i, const int index)
#include "immintrin.h"
CPUID Flags: AVX

Description

Copy a to dst, and insert the 32-bit integer i into dst at the location specified by index.

Operation

dst[255:0] := a[255:0] sel := index*32 dst[sel+31:sel] := i[31:0]
pinsrq
__m128i _mm_insert_epi64 (__m128i a, __int64 i, const int imm8)

Synopsis

__m128i _mm_insert_epi64 (__m128i a, __int64 i, const int imm8)
#include "smmintrin.h"
Instruction: pinsrq xmm, r64, imm
CPUID Flags: SSE4.1

Description

Copy a to dst, and insert the 64-bit integer i into dst at the location specified by imm8.

Operation

dst[127:0] := a[127:0] sel := imm8[0]*64 dst[sel+63:sel] := i[63:0]

Performance

ArchitectureLatencyThroughput
Haswell2-
Ivy Bridge2-
Sandy Bridge2-
Westmere1-
Nehalem1-
...
__m256i _mm256_insert_epi64 (__m256i a, __int64 i, const int index)

Synopsis

__m256i _mm256_insert_epi64 (__m256i a, __int64 i, const int index)
#include "immintrin.h"
CPUID Flags: AVX

Description

Copy a to dst, and insert the 64-bit integer i into dst at the location specified by index.

Operation

dst[255:0] := a[255:0] sel := index*64 dst[sel+63:sel] := i[63:0]
pinsrb
__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8)

Synopsis

__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8)
#include "smmintrin.h"
Instruction: pinsrb xmm, r32, imm
CPUID Flags: SSE4.1

Description

Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8.

Operation

dst[127:0] := a[127:0] sel := imm8[3:0]*8 dst[sel+7:sel] := i[7:0]

Performance

ArchitectureLatencyThroughput
Haswell2-
Ivy Bridge2-
Sandy Bridge2-
Westmere1-
Nehalem1-
...
__m256i _mm256_insert_epi8 (__m256i a, __int8 i, const int index)

Synopsis

__m256i _mm256_insert_epi8 (__m256i a, __int8 i, const int index)
#include "immintrin.h"
CPUID Flags: AVX

Description

Copy a to dst, and insert the 8-bit integer i into dst at the location specified by index.

Operation

dst[255:0] := a[255:0] sel := index*8 dst[sel+7:sel] := i[7:0]
pinsrw
__m64 _mm_insert_pi16 (__m64 a, int i, int imm8)

Synopsis

__m64 _mm_insert_pi16 (__m64 a, int i, int imm8)
#include "xmmintrin.h"
Instruction: pinsrw xmm, r32, imm
CPUID Flags: SSE

Description

Copy a to dst, and insert the 16-bit integer i into dst at the location specified by imm8.

Operation

dst[63:0] := a[63:0] sel := imm8[1:0]*16 dst[sel+15:sel] := i[15:0]

Performance

ArchitectureLatencyThroughput
Haswell2-
Ivy Bridge2-
Sandy Bridge2-
Westmere1-
Nehalem1-
insertps
__m128 _mm_insert_ps (__m128 a, __m128 b, const int imm8)

Synopsis

__m128 _mm_insert_ps (__m128 a, __m128 b, const int imm8)
#include "smmintrin.h"
Instruction: insertps xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Copy a to tmp, then insert a single-precision (32-bit) floating-point element from b into tmp using the control in imm8. Store tmp to dst using the mask in imm8 (elements are zeroed out when the corresponding bit is set).

Operation

tmp2[127:0] := a[127:0] CASE (imm8[7:6]) of 0: tmp1[31:0] := b[31:0] 1: tmp1[31:0] := b[63:32] 2: tmp1[31:0] := b[95:64] 3: tmp1[31:0] := b[127:96] ESAC CASE (imm8[5:4]) of 0: tmp2[31:0] := tmp1[31:0] 1: tmp2[63:32] := tmp1[31:0] 2: tmp2[95:64] := tmp1[31:0] 3: tmp2[127:96] := tmp1[31:0] ESAC FOR j := 0 to 3 i := j*32 IF imm8[j%8] dst[i+31:i] := 0 ELSE dst[i+31:i] := tmp2[i+31:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Ivy Bridge11
Sandy Bridge11
Westmere11
Nehalem11
vinsertf128
__m256d _mm256_insertf128_pd (__m256d a, __m128d b, int imm8)

Synopsis

__m256d _mm256_insertf128_pd (__m256d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX

Description

Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8.

Operation

dst[255:0] := a[255:0] CASE (imm8[1:0]) of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] ESAC dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge1-
Sandy Bridge1-
vinsertf128
__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8)

Synopsis

__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX

Description

Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.

Operation

dst[255:0] := a[255:0] CASE (imm8[1:0]) of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] ESAC dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge1-
Sandy Bridge1-
vinsertf128
__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8)

Synopsis

__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX

Description

Copy a to dst, then insert 128 bits from b into dst at the location specified by imm8.

Operation

dst[255:0] := a[255:0] CASE (imm8[1:0]) of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] ESAC dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge1-
Sandy Bridge1-
vinsertf32x4
__m256 _mm256_insertf32x4 (__m256 a, __m128 b, int imm8)

Synopsis

__m256 _mm256_insertf32x4 (__m256 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x4
CPUID Flags: AVX512VL + AVX512F

Description

Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.

Operation

dst[255:0] := a[255:0] CASE (imm8[1:0]) of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] ESAC dst[MAX:256] := 0
vinsertf32x4
__m256 _mm256_mask_insertf32x4 (__m256 src, __mmask8 k, __m256 a, __m128 b, int imm8)

Synopsis

__m256 _mm256_mask_insertf32x4 (__m256 src, __mmask8 k, __m256 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x4
CPUID Flags: AVX512VL + AVX512F

Description

Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[255:0] := a[255:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] ESAC FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vinsertf32x4
__m256 _mm256_maskz_insertf32x4 (__mmask8 k, __m256 a, __m128 b, int imm8)

Synopsis

__m256 _mm256_maskz_insertf32x4 (__mmask8 k, __m256 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x4
CPUID Flags: AVX512VL + AVX512F

Description

Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[255:0] := a[255:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] ESAC FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vinsertf32x4
__m512 _mm512_insertf32x4 (__m512 a, __m128 b, int imm8)

Synopsis

__m512 _mm512_insertf32x4 (__m512 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x4 zmm {k}, zmm, xmm, imm
CPUID Flags: AVX512F

Description

Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.

Operation

dst[511:0] := a[511:0] CASE (imm8[1:0]) of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] 2: dst[383:256] := b[127:0] 3: dst[511:384] := b[127:0] ESAC dst[MAX:512] := 0
vinsertf32x4
__m512 _mm512_mask_insertf32x4 (__m512 src, __mmask16 k, __m512 a, __m128 b, int imm8)

Synopsis

__m512 _mm512_mask_insertf32x4 (__m512 src, __mmask16 k, __m512 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x4 zmm {k}, zmm, xmm, imm
CPUID Flags: AVX512F

Description

Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] 2: tmp[383:256] := b[127:0] 3: tmp[511:384] := b[127:0] ESAC FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vinsertf32x4
__m512 _mm512_maskz_insertf32x4 (__mmask16 k, __m512 a, __m128 b, int imm8)

Synopsis

__m512 _mm512_maskz_insertf32x4 (__mmask16 k, __m512 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x4 zmm {k}, zmm, xmm, imm
CPUID Flags: AVX512F

Description

Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] 2: tmp[383:256] := b[127:0] 3: tmp[511:384] := b[127:0] ESAC FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vinsertf32x8
__m512 _mm512_insertf32x8 (__m512 a, __m256 b, int imm8)

Synopsis

__m512 _mm512_insertf32x8 (__m512 a, __m256 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x8
CPUID Flags: AVX512DQ

Description

Copy a to dst, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.

Operation

dst[511:0] := a[511:0] CASE (imm8[7:0]) OF 0: dst[255:0] := b[255:0] 1: dst[511:256] := b[255:0] ESAC dst[MAX:512] := 0
vinsertf32x8
__m512 _mm512_mask_insertf32x8 (__m512 src, __mmask16 k, __m512 a, __m256 b, int imm8)

Synopsis

__m512 _mm512_mask_insertf32x8 (__m512 src, __mmask16 k, __m512 a, __m256 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x8
CPUID Flags: AVX512DQ

Description

Copy a to tmp, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[7:0]) OF 0: tmp[255:0] := b[255:0] 1: tmp[511:256] := b[255:0] ESAC FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vinsertf32x8
__m512 _mm512_maskz_insertf32x8 (__mmask16 k, __m512 a, __m256 b, int imm8)

Synopsis

__m512 _mm512_maskz_insertf32x8 (__mmask16 k, __m512 a, __m256 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x8
CPUID Flags: AVX512DQ

Description

Copy a to tmp, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[7:0]) OF 0: tmp[255:0] := b[255:0] 1: tmp[511:256] := b[255:0] ESAC FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vinsertf64x2
__m256d _mm256_insertf64x2 (__m256d a, __m128d b, int imm8)

Synopsis

__m256d _mm256_insertf64x2 (__m256d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8.

Operation

dst[255:0] := a[255:0] CASE (imm8[1:0]) of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] ESAC dst[MAX:256] := 0
vinsertf64x2
__m256d _mm256_mask_insertf64x2 (__m256d src, __mmask8 k, __m256d a, __m128d b, int imm8)

Synopsis

__m256d _mm256_mask_insertf64x2 (__m256d src, __mmask8 k, __m256d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[255:0] := a[255:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] ESAC FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vinsertf64x2
__m256d _mm256_maskz_insertf64x2 (__mmask8 k, __m256d a, __m128d b, int imm8)

Synopsis

__m256d _mm256_maskz_insertf64x2 (__mmask8 k, __m256d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[255:0] := a[255:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] ESAC FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vinsertf64x2
__m512d _mm512_insertf64x2 (__m512d a, __m128d b, int imm8)

Synopsis

__m512d _mm512_insertf64x2 (__m512d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x2
CPUID Flags: AVX512DQ

Description

Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8.

Operation

dst[511:0] := a[511:0] CASE (imm8[1:0]) of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] 2: dst[383:256] := b[127:0] 3: dst[511:384] := b[127:0] ESAC dst[MAX:512] := 0
vinsertf64x2
__m512d _mm512_mask_insertf64x2 (__m512d src, __mmask8 k, __m512d a, __m128d b, int imm8)

Synopsis

__m512d _mm512_mask_insertf64x2 (__m512d src, __mmask8 k, __m512d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x2
CPUID Flags: AVX512DQ

Description

Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] 2: tmp[383:256] := b[127:0] 3: tmp[511:384] := b[127:0] ESAC FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vinsertf64x2
__m512d _mm512_maskz_insertf64x2 (__mmask8 k, __m512d a, __m128d b, int imm8)

Synopsis

__m512d _mm512_maskz_insertf64x2 (__mmask8 k, __m512d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x2
CPUID Flags: AVX512DQ

Description

Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] 2: tmp[383:256] := b[127:0] 3: tmp[511:384] := b[127:0] ESAC FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vinsertf64x4
__m512d _mm512_insertf64x4 (__m512d a, __m256d b, int imm8)

Synopsis

__m512d _mm512_insertf64x4 (__m512d a, __m256d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x4 zmm {k}, zmm, ymm, imm
CPUID Flags: AVX512F

Description

Copy a to dst, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8.

Operation

dst[511:0] := a[511:0] CASE (imm8[0]) of 0: dst[255:0] := b[255:0] 1: dst[511:256] := b[255:0] ESAC dst[MAX:512] := 0
vinsertf64x4
__m512d _mm512_mask_insertf64x4 (__m512d src, __mmask8 k, __m512d a, __m256d b, int imm8)

Synopsis

__m512d _mm512_mask_insertf64x4 (__m512d src, __mmask8 k, __m512d a, __m256d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x4 zmm {k}, zmm, ymm, imm
CPUID Flags: AVX512F

Description

Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[0]) of 0: tmp[255:0] := b[255:0] 1: tmp[511:256] := b[255:0] ESAC FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vinsertf64x4
__m512d _mm512_maskz_insertf64x4 (__mmask8 k, __m512d a, __m256d b, int imm8)

Synopsis

__m512d _mm512_maskz_insertf64x4 (__mmask8 k, __m512d a, __m256d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x4 zmm {k}, zmm, ymm, imm
CPUID Flags: AVX512F

Description

Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[0]) of 0: tmp[255:0] := b[255:0] 1: tmp[511:256] := b[255:0] ESAC FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vinserti128
__m256i _mm256_inserti128_si256 (__m256i a, __m128i b, const int imm8)

Synopsis

__m256i _mm256_inserti128_si256 (__m256i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vinserti128 ymm, ymm, xmm, imm
CPUID Flags: AVX2

Description

Copy a to dst, then insert 128 bits (composed of integer data) from b into dst at the location specified by imm8.

Operation

dst[255:0] := a[255:0] CASE (imm8[1:0]) of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] ESAC dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vinserti32x4
__m256i _mm256_inserti32x4 (__m256i a, __m128i b, int imm8)

Synopsis

__m256i _mm256_inserti32x4 (__m256i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x4
CPUID Flags: AVX512VL + AVX512F

Description

Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8.

Operation

dst[255:0] := a[255:0] CASE (imm8[1:0]) of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] ESAC dst[MAX:256] := 0
vinserti32x4
__m256i _mm256_mask_inserti32x4 (__m256i src, __mmask8 k, __m256i a, __m128i b, int imm8)

Synopsis

__m256i _mm256_mask_inserti32x4 (__m256i src, __mmask8 k, __m256i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x4
CPUID Flags: AVX512VL + AVX512F

Description

Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[255:0] := a[255:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] ESAC FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vinserti32x4
__m256i _mm256_maskz_inserti32x4 (__mmask8 k, __m256i a, __m128i b, int imm8)

Synopsis

__m256i _mm256_maskz_inserti32x4 (__mmask8 k, __m256i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x4
CPUID Flags: AVX512VL + AVX512F

Description

Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[255:0] := a[255:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] ESAC FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vinserti32x4
__m512i _mm512_inserti32x4 (__m512i a, __m128i b, int imm8)

Synopsis

__m512i _mm512_inserti32x4 (__m512i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x4 zmm {k}, zmm, xmm, imm
CPUID Flags: AVX512F

Description

Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8.

Operation

dst[511:0] := a[511:0] CASE (imm8[1:0]) of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] 2: dst[383:256] := b[127:0] 3: dst[511:384] := b[127:0] ESAC dst[MAX:512] := 0
vinserti32x4
__m512i _mm512_mask_inserti32x4 (__m512i src, __mmask16 k, __m512i a, __m128i b, int imm8)

Synopsis

__m512i _mm512_mask_inserti32x4 (__m512i src, __mmask16 k, __m512i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x4 zmm {k}, zmm, xmm, imm
CPUID Flags: AVX512F

Description

Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] 2: tmp[383:256] := b[127:0] 3: tmp[511:384] := b[127:0] ESAC FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vinserti32x4
__m512i _mm512_maskz_inserti32x4 (__mmask16 k, __m512i a, __m128i b, int imm8)

Synopsis

__m512i _mm512_maskz_inserti32x4 (__mmask16 k, __m512i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x4 zmm {k}, zmm, xmm, imm
CPUID Flags: AVX512F

Description

Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] 2: tmp[383:256] := b[127:0] 3: tmp[511:384] := b[127:0] ESAC FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vinserti32x8
__m512i _mm512_inserti32x8 (__m512i a, __m256i b, int imm8)

Synopsis

__m512i _mm512_inserti32x8 (__m512i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x8
CPUID Flags: AVX512DQ

Description

Copy a to dst, then insert 256 bits (composed of 8 packed 32-bit integers) from b into dst at the location specified by imm8.

Operation

dst[511:0] := a[511:0] CASE imm8[7:0] of 0: dst[255:0] := b[255:0] 1: dst[511:256] := b[255:0] ESAC dst[MAX:512] := 0
vinserti32x8
__m512i _mm512_mask_inserti32x8 (__m512i src, __mmask16 k, __m512i a, __m256i b, int imm8)

Synopsis

__m512i _mm512_mask_inserti32x8 (__m512i src, __mmask16 k, __m512i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x8
CPUID Flags: AVX512DQ

Description

Copy a to tmp, then insert 256 bits (composed of 8 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[7:0]) OF 0: tmp[255:0] := b[255:0] 1: tmp[511:256] := b[255:0] ESAC FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vinserti32x8
__m512i _mm512_maskz_inserti32x8 (__mmask16 k, __m512i a, __m256i b, int imm8)

Synopsis

__m512i _mm512_maskz_inserti32x8 (__mmask16 k, __m512i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x8
CPUID Flags: AVX512DQ

Description

Copy a to tmp, then insert 256 bits (composed of 8 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[7:0]) OF 0: tmp[255:0] := b[255:0] 1: tmp[511:256] := b[255:0] ESAC FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vinserti64x2
__m256i _mm256_inserti64x2 (__m256i a, __m128i b, int imm8)

Synopsis

__m256i _mm256_inserti64x2 (__m256i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the location specified by imm8.

Operation

dst[255:0] := a[255:0] CASE imm8[7:0] of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] ESAC dst[MAX:256] := 0
vinserti64x2
__m256i _mm256_mask_inserti64x2 (__m256i src, __mmask8 k, __m256i a, __m128i b, int imm8)

Synopsis

__m256i _mm256_mask_inserti64x2 (__m256i src, __mmask8 k, __m256i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[255:0] := a[255:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] ESAC FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vinserti64x2
__m256i _mm256_maskz_inserti64x2 (__mmask8 k, __m256i a, __m128i b, int imm8)

Synopsis

__m256i _mm256_maskz_inserti64x2 (__mmask8 k, __m256i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x2
CPUID Flags: AVX512VL + AVX512DQ

Description

Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[255:0] := a[255:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] ESAC FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vinserti64x2
__m512i _mm512_inserti64x2 (__m512i a, __m128i b, int imm8)

Synopsis

__m512i _mm512_inserti64x2 (__m512i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x2
CPUID Flags: AVX512DQ

Description

Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the location specified by imm8.

Operation

dst[511:0] := a[511:0] CASE imm8[7:0] of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] 2: dst[383:256] := b[127:0] 3: dst[511:384] := b[127:0] ESAC dst[MAX:512] := 0
vinserti64x2
__m512i _mm512_mask_inserti64x2 (__m512i src, __mmask8 k, __m512i a, __m128i b, int imm8)

Synopsis

__m512i _mm512_mask_inserti64x2 (__m512i src, __mmask8 k, __m512i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x2
CPUID Flags: AVX512DQ

Description

Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] 2: tmp[383:256] := b[127:0] 3: tmp[511:384] := b[127:0] ESAC FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vinserti64x2
__m512i _mm512_maskz_inserti64x2 (__mmask8 k, __m512i a, __m128i b, int imm8)

Synopsis

__m512i _mm512_maskz_inserti64x2 (__mmask8 k, __m512i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x2
CPUID Flags: AVX512DQ

Description

Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] 2: tmp[383:256] := b[127:0] 3: tmp[511:384] := b[127:0] ESAC FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vinserti64x4
__m512i _mm512_inserti64x4 (__m512i a, __m256i b, int imm8)

Synopsis

__m512i _mm512_inserti64x4 (__m512i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x4 zmm {k}, zmm, ymm, imm
CPUID Flags: AVX512F

Description

Copy a to dst, then insert 256 bits (composed of 4 packed 64-bit integers) from b into dst at the location specified by imm8.

Operation

dst[511:0] := a[511:0] CASE (imm8[7:0]) OF 0: dst[255:0] := b[255:0] 1: dst[511:256] := b[255:0] ESAC dst[MAX:512] := 0
vinserti64x4
__m512i _mm512_mask_inserti64x4 (__m512i src, __mmask8 k, __m512i a, __m256i b, int imm8)

Synopsis

__m512i _mm512_mask_inserti64x4 (__m512i src, __mmask8 k, __m512i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x4 zmm {k}, zmm, ymm, imm
CPUID Flags: AVX512F

Description

Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[0]) of 0: tmp[255:0] := b[255:0] 1: tmp[511:256] := b[255:0] ESAC FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vinserti64x4
__m512i _mm512_maskz_inserti64x4 (__mmask8 k, __m512i a, __m256i b, int imm8)

Synopsis

__m512i _mm512_maskz_inserti64x4 (__mmask8 k, __m512i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x4 zmm {k}, zmm, ymm, imm
CPUID Flags: AVX512F

Description

Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[511:0] := a[511:0] CASE (imm8[0]) of 0: tmp[255:0] := b[255:0] 1: tmp[511:256] := b[255:0] ESAC FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
kmov
__mmask16 _mm512_int2mask (int mask)

Synopsis

__mmask16 _mm512_int2mask (int mask)
#include "immintrin.h"
Instruction: kmov k, r32
CPUID Flags: KNCNI

Description

Converts integer mask into bitmask, storing the result in dst.

Operation

dst := mask[15:0]
...
__m128d _mm_invcbrt_pd (__m128d a)

Synopsis

__m128d _mm_invcbrt_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := InvCubeRoot(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_invcbrt_pd (__m256d a)

Synopsis

__m256d _mm256_invcbrt_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := InvCubeRoot(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m128 _mm_invcbrt_ps (__m128 a)

Synopsis

__m128 _mm_invcbrt_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := InvCubeRoot(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_invcbrt_ps (__m256 a)

Synopsis

__m256 _mm256_invcbrt_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := InvCubeRoot(a[i+31:i]) ENDFOR dst[MAX:256] := 0
invpcid
void _invpcid (unsigned int type, void* descriptor)

Synopsis

void _invpcid (unsigned int type, void* descriptor)
#include "immintrin.h"
Instruction: invpcid r32, m128
CPUID Flags: INVPCID

Description

Invalidate mappings in the Translation Lookaside Buffers (TLBs) and paging-structure caches for the processor context identifier (PCID) specified by descriptor based on the invalidation type specified in type. The PCID descriptor is specified as a 16-byte memory operand (with no alignment restrictions) where bits [11:0] specify the PCID, and bits [127:64] specify the linear address; bits [63:12] are reserved. The types supported are: 0) Individual-address invalidation: If type is 0, the logical processor invalidates mappings for a single linear address and tagged with the PCID specified in descriptor, except global translations. The instruction may also invalidate global translations, mappings for other linear addresses, or mappings tagged with other PCIDs. 1) Single-context invalidation: If type is 1, the logical processor invalidates all mappings tagged with the PCID specified in descriptor except global translations. In some cases, it may invalidate mappings for other PCIDs as well. 2) All-context invalidation: If type is 2, the logical processor invalidates all mappings tagged with any PCID. 3) All-context invalidation, retaining global translations: If type is 3, the logical processor invalidates all mappings tagged with any PCID except global translations, ignoring descriptor. The instruction may also invalidate global translations as well.

Operation

CASE type OF 0: // individual-address invalidation retaining global translations OP_PCID := descriptor[11:0] ADDR := descriptor[127:64] BREAK 1: // single PCID invalidation retaining globals OP_PCID := descriptor[11:0] // invalidate all mappings tagged with OP_PCID except global translations BREAK 2: // all PCID invalidation // invalidate all mappings tagged with any PCID BREAK 3: // all PCID invalidation retaining global translations // invalidate all mappings tagged with any PCID except global translations BREAK ESAC
...
__m128d _mm_invsqrt_pd (__m128d a)

Synopsis

__m128d _mm_invsqrt_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := InvSQRT(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_invsqrt_pd (__m256d a)

Synopsis

__m256d _mm256_invsqrt_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := InvSQRT(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_invsqrt_pd (__m512d a)

Synopsis

__m512d _mm512_invsqrt_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := InvSQRT(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_invsqrt_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_invsqrt_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := InvSQRT(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_invsqrt_ps (__m128 a)

Synopsis

__m128 _mm_invsqrt_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the inverse square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := InvSQRT(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_invsqrt_ps (__m256 a)

Synopsis

__m256 _mm256_invsqrt_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the inverse square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := InvSQRT(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_invsqrt_ps (__m512 a)

Synopsis

__m512 _mm512_invsqrt_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := InvSQRT(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_invsqrt_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_invsqrt_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the inverse square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := InvSQRT(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128i _mm_irem_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_irem_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed 32-bit integers in a by packed elements in b, and store the remainders as packed 32-bit integers in dst.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_irem_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_irem_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed 32-bit integers in a by packed elements in b, and store the remainders as packed 32-bit integers in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:256] := 0
kandw
__mmask16 _mm512_kand (__mmask16 a, __mmask16 b)

Synopsis

__mmask16 _mm512_kand (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kandw k, k, k
CPUID Flags: AVX512F

Description

Compute the bitwise AND of 16-bit masks a and b, and store the result in k.

Operation

k[15:0] := a[15:0] AND b[15:0] k[MAX:16] := 0
kand
__mmask16 _mm512_kand (__mmask16 a, __mmask16 b)

Synopsis

__mmask16 _mm512_kand (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kand k, k
CPUID Flags: KNCNI

Description

Compute the bitwise AND of 16-bit masks a and b, and store the result in k.

Operation

k[15:0] := a[15:0] AND b[15:0] k[MAX:16] := 0
kandn
__mmask16 _mm512_kandn (__mmask16 a, __mmask16 b)

Synopsis

__mmask16 _mm512_kandn (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kandn k, k
CPUID Flags: KNCNI

Description

Compute the bitwise AND NOT of 16-bit masks a and b, and store the result in k.

Operation

k[15:0] := (NOT a[15:0]) AND b[15:0] k[MAX:16] := 0
kandnw
__mmask16 _mm512_kandn (__mmask16 a, __mmask16 b)

Synopsis

__mmask16 _mm512_kandn (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kandnw k, k, k
CPUID Flags: AVX512F

Description

Compute the bitwise AND NOT of 16-bit masks a and b, and store the result in k.

Operation

k[15:0] := (NOT a[15:0]) AND b[15:0] k[MAX:16] := 0
kandnr
__mmask16 _mm512_kandnr (__mmask16 k1, __mmask16 k2)

Synopsis

__mmask16 _mm512_kandnr (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kandnr k, k
CPUID Flags: KNCNI

Description

Performs a bitwise AND operation between NOT of k2 and k1, storing the result in dst.

Operation

dst[15:0] := NOT(k2[15:0]) & k1[15:0]
kconcath
__int64 _mm512_kconcathi_64 (__mmask16 k1, __mmask16 k2)

Synopsis

__int64 _mm512_kconcathi_64 (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kconcath r, k, k
CPUID Flags: KNCNI

Description

Packs masks k1 and k2 into the high 32 bits of dst. The rest of dst is set to 0.

Operation

dst[63:48] := k1[15:0] dst[47:32] := k2[15:0] dst[31:0] := 0
kconcatl
__int64 _mm512_kconcatlo_64 (__mmask16 k1, __mmask16 k2)

Synopsis

__int64 _mm512_kconcatlo_64 (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kconcatl r, k, k
CPUID Flags: KNCNI

Description

Packs masks k1 and k2 into the low 32 bits of dst. The rest of dst is set to 0.

Operation

dst[31:16] := k1[15:0] dst[15:0] := k2[15:0] dst[63:32] := 0
kextract
__mmask16 _mm512_kextract_64 (__int64 a, const int b)

Synopsis

__mmask16 _mm512_kextract_64 (__int64 a, const int b)
#include "immintrin.h"
Instruction: kextract k, r, imm
CPUID Flags: KNCNI

Description

Extracts 16-bit value b from 64-bit integer a, storing the result in dst.

Operation

CASE b of 0: dst[15:0] := a[63:48] 1: dst[15:0] := a[47:32] 2: dst[15:0] := a[31:16] 3: dst[15:0] := a[15:0] ESAC dst[MAX:16] := 0
kmerge2l1h
__mmask16 _mm512_kmerge2l1h (__mmask16 k1, __mmask16 k2)

Synopsis

__mmask16 _mm512_kmerge2l1h (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kmerge2l1h k, k
CPUID Flags: KNCNI

Description

Move the high element from k1 to the low element of k1, and insert the low element of k2 into the high element of k1.

Operation

tmp[7:0] := k1[15:8] k1[15:8] := k2[7:0] k1[7:0] := tmp[7:0]
kmerge2l1l
__mmask16 _mm512_kmerge2l1l (__mmask16 k1, __mmask16 k2)

Synopsis

__mmask16 _mm512_kmerge2l1l (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kmerge2l1l k, k
CPUID Flags: KNCNI

Description

Insert the low element of k2 into the high element of k1.

Operation

k1[15:8] := k2[7:0]
kmovw
__mmask16 _mm512_kmov (__mmask16 a)

Synopsis

__mmask16 _mm512_kmov (__mmask16 a)
#include "immintrin.h"
Instruction: kmovw k, k
CPUID Flags: AVX512F

Description

Copy 16-bit mask a to k.

Operation

k[15:0] := a[15:0] k[MAX:16] := 0
kmov
__mmask16 _mm512_kmov (__mmask16 a)

Synopsis

__mmask16 _mm512_kmov (__mmask16 a)
#include "immintrin.h"
Instruction: kmov k, k
CPUID Flags: KNCNI

Description

Copy 16-bit mask a to k.

Operation

k[15:0] := a[15:0] k[MAX:16] := 0
kmerge2l1l
__mmask16 _mm512_kmovlhb (__mmask16 k1, __mmask16 k2)

Synopsis

__mmask16 _mm512_kmovlhb (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kmerge2l1l k, k
CPUID Flags: KNCNI

Description

Inserts the low byte of mask k2 into the high byte of dst, and copies the low byte of k1 to the low byte of dst.

Operation

dst[7:0] := k1[7:0] dst[15:8] := k2[7:0]
knot
__mmask16 _mm512_knot (__mmask16 a)

Synopsis

__mmask16 _mm512_knot (__mmask16 a)
#include "immintrin.h"
Instruction: knot k, k
CPUID Flags: KNCNI

Description

Compute the bitwise NOT of 16-bit mask a, and store the result in k.

Operation

k[15:0] := NOT a[15:0] k[MAX:16] := 0
knotw
__mmask16 _mm512_knot (__mmask16 a)

Synopsis

__mmask16 _mm512_knot (__mmask16 a)
#include "immintrin.h"
Instruction: knotw k, k
CPUID Flags: AVX512F

Description

Compute the bitwise NOT of 16-bit mask a, and store the result in k.

Operation

k[15:0] := NOT a[15:0] k[MAX:16] := 0
kor
__mmask16 _mm512_kor (__mmask16 a, __mmask16 b)

Synopsis

__mmask16 _mm512_kor (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kor k, k
CPUID Flags: KNCNI

Description

Compute the bitwise OR of 16-bit masks a and b, and store the result in k.

Operation

k[15:0] := a[15:0] OR b[15:0] k[MAX:16] := 0
korw
__mmask16 _mm512_kor (__mmask16 a, __mmask16 b)

Synopsis

__mmask16 _mm512_kor (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: korw k, k, k
CPUID Flags: AVX512F

Description

Compute the bitwise OR of 16-bit masks a and b, and store the result in k.

Operation

k[15:0] := a[15:0] OR b[15:0] k[MAX:16] := 0
kortest
int _mm512_kortestc (__mmask16 k1, __mmask16 k2)

Synopsis

int _mm512_kortestc (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kortest k, k
CPUID Flags: KNCNI

Description

Performs bitwise OR between k1 and k2, storing the result in dst. CF flag is set if dst consists of all 1's.

Operation

dst[15:0] := k1[15:0] | k2[15:0] IF PopCount(dst[15:0]) = 16 SetCF() FI
kortestw
int _mm512_kortestc (__mmask16 k1, __mmask16 k2)

Synopsis

int _mm512_kortestc (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kortestw k, k
CPUID Flags: AVX512F

Description

Performs bitwise OR between k1 and k2, storing the result in dst. CF flag is set if dst consists of all 1's.

Operation

dst[15:0] := k1[15:0] | k2[15:0] IF PopCount(dst[15:0]) = 16 SetCF() FI
kortestw
int _mm512_kortestz (__mmask16 k1, __mmask16 k2)

Synopsis

int _mm512_kortestz (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kortestw k, k
CPUID Flags: AVX512F

Description

Performs bitwise OR between k1 and k2, storing the result in dst. ZF flag is set if dst is 0.

Operation

dst[15:0] := k1[15:0] | k2[15:0] IF dst = 0 SetZF() FI
kortest
int _mm512_kortestz (__mmask16 k1, __mmask16 k2)

Synopsis

int _mm512_kortestz (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kortest k, k
CPUID Flags: KNCNI

Description

Performs bitwise OR between k1 and k2, storing the result in dst. ZF flag is set if dst is 0.

Operation

dst[15:0] := k1[15:0] | k2[15:0] IF dst = 0 SetZF() FI
kmerge2l1h
__mmask16 _mm512_kswapb (__mmask16 k1, __mmask16 k2)

Synopsis

__mmask16 _mm512_kswapb (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kmerge2l1h k, k
CPUID Flags: KNCNI

Description

Moves high byte from k2 to low byte of k1, and moves low byte of k2 to high byte of k1.

Operation

tmp[7:0] := k2[15:8] k2[15:8] := k1[7:0] k1[7:0] := tmp[7:0] tmp[7:0] := k2[7:0] k2[7:0] := k1[15:8] k1[15:8] := tmp[7:0]
kunpckbw
__mmask16 _mm512_kunpackb (__mmask16 a, __mmask16 b)

Synopsis

__mmask16 _mm512_kunpackb (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kunpckbw k, k, k
CPUID Flags: AVX512F

Description

Unpack and interleave 8 bits from masks a and b, and store the 16-bit result in k.

Operation

k[7:0] := b[7:0] k[15:8] := a[7:0] k[MAX:16] := 0
kunpckdq
__mmask64 _mm512_kunpackd (__mmask64 a, __mmask64 b)

Synopsis

__mmask64 _mm512_kunpackd (__mmask64 a, __mmask64 b)
#include "immintrin.h"
Instruction: kunpckdq
CPUID Flags: AVX512BW

Description

Unpack and interleave 32 bits from masks a and b, and store the 64-bit result in k.

Operation

k[31:0] := b[31:0] k[63:32] := a[31:0] k[MAX:64] := 0
kunpckwd
__mmask32 _mm512_kunpackw (__mmask32 a, __mmask32 b)

Synopsis

__mmask32 _mm512_kunpackw (__mmask32 a, __mmask32 b)
#include "immintrin.h"
Instruction: kunpckwd
CPUID Flags: AVX512BW

Description

Unpack and interleave 16 bits from masks a and b, and store the 32-bit result in k.

Operation

k[15:0] := b[15:0] k[31:16] := a[15:0] k[MAX:32] := 0
kxnorw
__mmask16 _mm512_kxnor (__mmask16 a, __mmask16 b)

Synopsis

__mmask16 _mm512_kxnor (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kxnorw k, k, k
CPUID Flags: AVX512F

Description

Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k.

Operation

k[15:0] := NOT (a[15:0] XOR b[15:0]) k[MAX:16] := 0
kxnor
__mmask16 _mm512_kxnor (__mmask16 a, __mmask16 b)

Synopsis

__mmask16 _mm512_kxnor (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kxnor k, k
CPUID Flags: KNCNI

Description

Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k.

Operation

k[15:0] := NOT (a[15:0] XOR b[15:0]) k[MAX:16] := 0
kxor
__mmask16 _mm512_kxor (__mmask16 a, __mmask16 b)

Synopsis

__mmask16 _mm512_kxor (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kxor k, k
CPUID Flags: KNCNI

Description

Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.

Operation

k[15:0] := a[15:0] XOR b[15:0] k[MAX:16] := 0
kxorw
__mmask16 _mm512_kxor (__mmask16 a, __mmask16 b)

Synopsis

__mmask16 _mm512_kxor (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kxorw k, k, k
CPUID Flags: AVX512F

Description

Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.

Operation

k[15:0] := a[15:0] XOR b[15:0] k[MAX:16] := 0
lddqu
__m128i _mm_lddqu_si128 (__m128i const* mem_addr)

Synopsis

__m128i _mm_lddqu_si128 (__m128i const* mem_addr)
#include "pmmintrin.h"
Instruction: lddqu xmm, m128
CPUID Flags: SSE3

Description

Load 128-bits of integer data from unaligned memory into dst. This intrinsic may perform better than _mm_loadu_si128 when the data crosses a cache line boundary.

Operation

dst[127:0] := MEM[mem_addr+127:mem_addr]
vlddqu
__m256i _mm256_lddqu_si256 (__m256i const * mem_addr)

Synopsis

__m256i _mm256_lddqu_si256 (__m256i const * mem_addr)
#include "immintrin.h"
Instruction: vlddqu ymm, m256
CPUID Flags: AVX

Description

Load 256-bits of integer data from unaligned memory into dst. This intrinsic may perform better than _mm256_loadu_si256 when the data crosses a cache line boundary.

Operation

dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0
lfence
void _mm_lfence (void)

Synopsis

void _mm_lfence (void)
#include "emmintrin.h"
Instruction: lfence
CPUID Flags: SSE2

Description

Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. Guarantees that every load instruction that precedes, in program order, is globally visible before any load instruction which follows the fence in program order.

Performance

ArchitectureLatencyThroughput
Westmere2-
Nehalem2-
vmovdqa32
__m128i _mm_mask_load_epi32 (__m128i src, __mmask8 k, void const* mem_addr)

Synopsis

__m128i _mm_mask_load_epi32 (__m128i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 32-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vmovdqa32
__m128i _mm_maskz_load_epi32 (__mmask8 k, void const* mem_addr)

Synopsis

__m128i _mm_maskz_load_epi32 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 32-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovdqa32
__m256i _mm256_mask_load_epi32 (__m256i src, __mmask8 k, void const* mem_addr)

Synopsis

__m256i _mm256_mask_load_epi32 (__m256i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 32-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vmovdqa32
__m256i _mm256_maskz_load_epi32 (__mmask8 k, void const* mem_addr)

Synopsis

__m256i _mm256_maskz_load_epi32 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 32-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovdqa32
__m512i _mm512_load_epi32 (void const* mem_addr)

Synopsis

__m512i _mm512_load_epi32 (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Load 512-bits (composed of 16 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

dst[511:0] := MEM[mem_addr+511:mem_addr] dst[MAX:512] := 0
vmovdqa32
__m512i _mm512_mask_load_epi32 (__m512i src, __mmask16 k, void const* mem_addr)

Synopsis

__m512i _mm512_mask_load_epi32 (__m512i src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Load packed 32-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vmovdqa32
__m512i _mm512_maskz_load_epi32 (__mmask16 k, void const* mem_addr)

Synopsis

__m512i _mm512_maskz_load_epi32 (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, m512
CPUID Flags: AVX512F

Description

Load packed 32-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmovdqa64
__m128i _mm_mask_load_epi64 (__m128i src, __mmask8 k, void const* mem_addr)

Synopsis

__m128i _mm_mask_load_epi64 (__m128i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 64-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vmovdqa64
__m128i _mm_maskz_load_epi64 (__mmask8 k, void const* mem_addr)

Synopsis

__m128i _mm_maskz_load_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 64-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovdqa64
__m256i _mm256_mask_load_epi64 (__m256i src, __mmask8 k, void const* mem_addr)

Synopsis

__m256i _mm256_mask_load_epi64 (__m256i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 64-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vmovdqa64
__m256i _mm256_maskz_load_epi64 (__mmask8 k, void const* mem_addr)

Synopsis

__m256i _mm256_maskz_load_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 64-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovdqa64
__m512i _mm512_load_epi64 (void const* mem_addr)

Synopsis

__m512i _mm512_load_epi64 (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa64 zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Load 512-bits (composed of 8 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

dst[511:0] := MEM[mem_addr+511:mem_addr] dst[MAX:512] := 0
vmovdqa64
__m512i _mm512_mask_load_epi64 (__m512i src, __mmask8 k, void const* mem_addr)

Synopsis

__m512i _mm512_mask_load_epi64 (__m512i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa64 zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Load packed 64-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vmovdqa64
__m512i _mm512_maskz_load_epi64 (__mmask8 k, void const* mem_addr)

Synopsis

__m512i _mm512_maskz_load_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa64 zmm {k}, m512
CPUID Flags: AVX512F

Description

Load packed 64-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
movapd
__m128d _mm_load_pd (double const* mem_addr)

Synopsis

__m128d _mm_load_pd (double const* mem_addr)
#include "emmintrin.h"
Instruction: movapd xmm, m128
CPUID Flags: SSE2

Description

Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

dst[127:0] := MEM[mem_addr+127:mem_addr]
vmovapd
__m128d _mm_mask_load_pd (__m128d src, __mmask8 k, void const* mem_addr)

Synopsis

__m128d _mm_mask_load_pd (__m128d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F

Description

Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vmovapd
__m128d _mm_maskz_load_pd (__mmask8 k, void const* mem_addr)

Synopsis

__m128d _mm_maskz_load_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F

Description

Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovapd
__m256d _mm256_load_pd (double const * mem_addr)

Synopsis

__m256d _mm256_load_pd (double const * mem_addr)
#include "immintrin.h"
Instruction: vmovapd ymm, m256
CPUID Flags: AVX

Description

Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0
vmovapd
__m256d _mm256_mask_load_pd (__m256d src, __mmask8 k, void const* mem_addr)

Synopsis

__m256d _mm256_mask_load_pd (__m256d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F

Description

Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vmovapd
__m256d _mm256_maskz_load_pd (__mmask8 k, void const* mem_addr)

Synopsis

__m256d _mm256_maskz_load_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F

Description

Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovapd
__m512d _mm512_load_pd (void const* mem_addr)

Synopsis

__m512d _mm512_load_pd (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovapd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

dst[511:0] := MEM[mem_addr+511:mem_addr] dst[MAX:512] := 0
vmovapd
__m512d _mm512_mask_load_pd (__m512d src, __mmask8 k, void const* mem_addr)

Synopsis

__m512d _mm512_mask_load_pd (__m512d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovapd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vmovapd
__m512d _mm512_maskz_load_pd (__mmask8 k, void const* mem_addr)

Synopsis

__m512d _mm512_maskz_load_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovapd zmm {k}, m512
CPUID Flags: AVX512F

Description

Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_load_pd1 (double const* mem_addr)

Synopsis

__m128d _mm_load_pd1 (double const* mem_addr)
#include "emmintrin.h"
Instruction: movapd xmm, m128
CPUID Flags: SSE2

Description

Load a double-precision (64-bit) floating-point element from memory into both elements of dst.

Operation

dst[63:0] := MEM[mem_addr+63:mem_addr] dst[127:64] := MEM[mem_addr+63:mem_addr]
movaps
__m128 _mm_load_ps (float const* mem_addr)

Synopsis

__m128 _mm_load_ps (float const* mem_addr)
#include "xmmintrin.h"
Instruction: movaps xmm, m128
CPUID Flags: SSE

Description

Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

dst[127:0] := MEM[mem_addr+127:mem_addr]
vmovaps
__m128 _mm_mask_load_ps (__m128 src, __mmask8 k, void const* mem_addr)

Synopsis

__m128 _mm_mask_load_ps (__m128 src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F

Description

Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vmovaps
__m128 _mm_maskz_load_ps (__mmask8 k, void const* mem_addr)

Synopsis

__m128 _mm_maskz_load_ps (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F

Description

Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovaps
__m256 _mm256_load_ps (float const * mem_addr)

Synopsis

__m256 _mm256_load_ps (float const * mem_addr)
#include "immintrin.h"
Instruction: vmovaps ymm, m256
CPUID Flags: AVX

Description

Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0
vmovaps
__m256 _mm256_mask_load_ps (__m256 src, __mmask8 k, void const* mem_addr)

Synopsis

__m256 _mm256_mask_load_ps (__m256 src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F

Description

Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vmovaps
__m256 _mm256_maskz_load_ps (__mmask8 k, void const* mem_addr)

Synopsis

__m256 _mm256_maskz_load_ps (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F

Description

Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovaps
__m512 _mm512_load_ps (void const* mem_addr)

Synopsis

__m512 _mm512_load_ps (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovaps zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

dst[511:0] := MEM[mem_addr+511:mem_addr] dst[MAX:512] := 0
vmovaps
__m512 _mm512_mask_load_ps (__m512 src, __mmask16 k, void const* mem_addr)

Synopsis

__m512 _mm512_mask_load_ps (__m512 src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovaps zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vmovaps
__m512 _mm512_maskz_load_ps (__mmask16 k, void const* mem_addr)

Synopsis

__m512 _mm512_maskz_load_ps (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovaps zmm {k}, m512
CPUID Flags: AVX512F

Description

Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_load_ps1 (float const* mem_addr)

Synopsis

__m128 _mm_load_ps1 (float const* mem_addr)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Load a single-precision (32-bit) floating-point element from memory into all elements of dst.

Operation

dst[31:0] := MEM[mem_addr+31:mem_addr] dst[63:32] := MEM[mem_addr+31:mem_addr] dst[95:64] := MEM[mem_addr+31:mem_addr] dst[127:96] := MEM[mem_addr+31:mem_addr]
movsd
__m128d _mm_load_sd (double const* mem_addr)

Synopsis

__m128d _mm_load_sd (double const* mem_addr)
#include "emmintrin.h"
Instruction: movsd xmm, m64
CPUID Flags: SSE2

Description

Load a double-precision (64-bit) floating-point element from memory into the lower element of dst, and zero the upper element. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[63:0] := MEM[mem_addr+63:mem_addr] dst[127:64] := 0
vmovsd
__m128d _mm_mask_load_sd (__m128d src, __mmask8 k, const double* mem_addr)

Synopsis

__m128d _mm_mask_load_sd (__m128d src, __mmask8 k, const double* mem_addr)
#include "immintrin.h"
Instruction: vmovsd xmm {k}, m64
CPUID Flags: AVX512F

Description

Load a double-precision (64-bit) floating-point element from memory into the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper element of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

IF k[0] dst[63:0] := MEM[mem_addr+63:mem_addr] ELSE dst[63:0] := src[63:0] FI dst[MAX:64] := 0
vmovsd
__m128d _mm_maskz_load_sd (__mmask8 k, const double* mem_addr)

Synopsis

__m128d _mm_maskz_load_sd (__mmask8 k, const double* mem_addr)
#include "immintrin.h"
Instruction: vmovsd xmm {k}, m64
CPUID Flags: AVX512F

Description

Load a double-precision (64-bit) floating-point element from memory into the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper element of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

IF k[0] dst[63:0] := MEM[mem_addr+63:mem_addr] ELSE dst[63:0] := 0 FI dst[MAX:64] := 0
movdqa
__m128i _mm_load_si128 (__m128i const* mem_addr)

Synopsis

__m128i _mm_load_si128 (__m128i const* mem_addr)
#include "emmintrin.h"
Instruction: movdqa xmm, m128
CPUID Flags: SSE2

Description

Load 128-bits of integer data from memory into dst. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

dst[127:0] := MEM[mem_addr+127:mem_addr]
vmovdqa
__m256i _mm256_load_si256 (__m256i const * mem_addr)

Synopsis

__m256i _mm256_load_si256 (__m256i const * mem_addr)
#include "immintrin.h"
Instruction: vmovdqa ymm, m256
CPUID Flags: AVX

Description

Load 256-bits of integer data from memory into dst. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0
vmovdqa32
__m512i _mm512_load_si512 (void const* mem_addr)

Synopsis

__m512i _mm512_load_si512 (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Load 512-bits of integer data from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

dst[511:0] := MEM[mem_addr+511:mem_addr] dst[MAX:512] := 0
movss
__m128 _mm_load_ss (float const* mem_addr)

Synopsis

__m128 _mm_load_ss (float const* mem_addr)
#include "xmmintrin.h"
Instruction: movss xmm, m32
CPUID Flags: SSE

Description

Load a single-precision (32-bit) floating-point element from memory into the lower element of dst, and zero the upper 3 elements. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[31:0] := MEM[mem_addr+31:mem_addr] dst[127:32] := 0
vmovss
__m128 _mm_mask_load_ss (__m128 src, __mmask8 k, const float* mem_addr)

Synopsis

__m128 _mm_mask_load_ss (__m128 src, __mmask8 k, const float* mem_addr)
#include "immintrin.h"
Instruction: vmovss xmm {k}, m32
CPUID Flags: AVX512F

Description

Load a single-precision (32-bit) floating-point element from memory into the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

IF k[0] dst[31:0] := MEM[mem_addr+31:mem_addr] ELSE dst[31:0] := src[31:0] FI dst[MAX:32] := 0
vmovss
__m128 _mm_maskz_load_ss (__mmask8 k, const float* mem_addr)

Synopsis

__m128 _mm_maskz_load_ss (__mmask8 k, const float* mem_addr)
#include "immintrin.h"
Instruction: vmovss xmm {k}, m32
CPUID Flags: AVX512F

Description

Load a single-precision (32-bit) floating-point element from memory into the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

IF k[0] dst[31:0] := MEM[mem_addr+31:mem_addr] ELSE dst[31:0] := 0 FI dst[MAX:32] := 0
...
__m128d _mm_load1_pd (double const* mem_addr)

Synopsis

__m128d _mm_load1_pd (double const* mem_addr)
#include "emmintrin.h"
Instruction: movapd xmm, m128
CPUID Flags: SSE2

Description

Load a double-precision (64-bit) floating-point element from memory into both elements of dst.

Operation

dst[63:0] := MEM[mem_addr+63:mem_addr] dst[127:64] := MEM[mem_addr+63:mem_addr]
...
__m128 _mm_load1_ps (float const* mem_addr)

Synopsis

__m128 _mm_load1_ps (float const* mem_addr)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Load a single-precision (32-bit) floating-point element from memory into all elements of dst.

Operation

dst[31:0] := MEM[mem_addr+31:mem_addr] dst[63:32] := MEM[mem_addr+31:mem_addr] dst[95:64] := MEM[mem_addr+31:mem_addr] dst[127:96] := MEM[mem_addr+31:mem_addr]
...
short _loadbe_i16 (void const * ptr)

Synopsis

short _loadbe_i16 (void const * ptr)
#include "immintrin.h"

Description

Loads a big-endian word (16-bit) value from address ptr and stores the result in dst.

Operation

addr := MEM[ptr] FOR j := 0 to 1 i := j*8 dst[i+7:i] := addr[15-i:15-i-7] ENDFOR
...
int _loadbe_i32 (void const * ptr)

Synopsis

int _loadbe_i32 (void const * ptr)
#include "immintrin.h"

Description

Loads a big-endian double word (32-bit) value from address ptr and stores the result in dst.

Operation

addr := MEM[ptr] FOR j := 0 to 3 i := j*8 dst[i+7:i] := addr[31-i:31-i-7] ENDFOR
...
__int64 _loadbe_i64 (void const * ptr)

Synopsis

__int64 _loadbe_i64 (void const * ptr)
#include "immintrin.h"

Description

Loads a big-endian quad word (64-bit) value from address ptr and stores the result in dst.

Operation

addr := MEM[ptr] FOR j := 0 to 7 i := j*8 dst[i+7:i] := addr[63-i:63-i-7] ENDFOR
movddup
__m128d _mm_loaddup_pd (double const* mem_addr)

Synopsis

__m128d _mm_loaddup_pd (double const* mem_addr)
#include "pmmintrin.h"
Instruction: movddup xmm, m64
CPUID Flags: SSE3

Description

Load a double-precision (64-bit) floating-point element from memory into both elements of dst.

Operation

dst[63:0] := MEM[mem_addr+63:mem_addr] dst[127:64] := MEM[mem_addr+63:mem_addr]
movhpd
__m128d _mm_loadh_pd (__m128d a, double const* mem_addr)

Synopsis

__m128d _mm_loadh_pd (__m128d a, double const* mem_addr)
#include "emmintrin.h"
Instruction: movhpd xmm, m64
CPUID Flags: SSE2

Description

Load a double-precision (64-bit) floating-point element from memory into the upper element of dst, and copy the lower element from a to dst. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[63:0] := a[63:0] dst[127:64] := MEM[mem_addr+63:mem_addr]

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge1-
Westmere1-
Nehalem1-
movhps
__m128 _mm_loadh_pi (__m128 a, __m64 const* mem_addr)

Synopsis

__m128 _mm_loadh_pi (__m128 a, __m64 const* mem_addr)
#include "xmmintrin.h"
Instruction: movhps xmm, m64
CPUID Flags: SSE

Description

Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of dst, and copy the lower 2 elements from a to dst. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[31:0] := a[31:0] dst[63:32] := a[63:32] dst[95:64] := MEM[mem_addr+31:mem_addr] dst[127:96] := MEM[mem_addr+63:mem_addr+32]

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge1-
Westmere1-
Nehalem1-
movq
__m128i _mm_loadl_epi64 (__m128i const* mem_addr)

Synopsis

__m128i _mm_loadl_epi64 (__m128i const* mem_addr)
#include "emmintrin.h"
Instruction: movq xmm, m64
CPUID Flags: SSE2

Description

Load 64-bit integer from memory into the first element of dst.

Operation

dst[63:0] := MEM[mem_addr+63:mem_addr] dst[MAX:64] := 0
movlpd
__m128d _mm_loadl_pd (__m128d a, double const* mem_addr)

Synopsis

__m128d _mm_loadl_pd (__m128d a, double const* mem_addr)
#include "emmintrin.h"
Instruction: movlpd xmm, m64
CPUID Flags: SSE2

Description

Load a double-precision (64-bit) floating-point element from memory into the lower element of dst, and copy the upper element from a to dst. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[63:0] := MEM[mem_addr+63:mem_addr] dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge1-
Westmere1-
Nehalem1-
movlps
__m128 _mm_loadl_pi (__m128 a, __m64 const* mem_addr)

Synopsis

__m128 _mm_loadl_pi (__m128 a, __m64 const* mem_addr)
#include "xmmintrin.h"
Instruction: movlps xmm, m64
CPUID Flags: SSE

Description

Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of dst, and copy the upper 2 elements from a to dst. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[31:0] := MEM[mem_addr+31:mem_addr] dst[63:32] := MEM[mem_addr+63:mem_addr+32] dst[95:64] := a[95:64] dst[127:96] := a[127:96]

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge1-
Westmere1-
Nehalem1-
...
__m128d _mm_loadr_pd (double const* mem_addr)

Synopsis

__m128d _mm_loadr_pd (double const* mem_addr)
#include "emmintrin.h"
Instruction: movapd xmm, m128
CPUID Flags: SSE2

Description

Load 2 double-precision (64-bit) floating-point elements from memory into dst in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

dst[63:0] := MEM[mem_addr+127:mem_addr+64] dst[127:64] := MEM[mem_addr+63:mem_addr]
...
__m128 _mm_loadr_ps (float const* mem_addr)

Synopsis

__m128 _mm_loadr_ps (float const* mem_addr)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Load 4 single-precision (32-bit) floating-point elements from memory into dst in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

dst[31:0] := MEM[mem_addr+127:mem_addr+96] dst[63:32] := MEM[mem_addr+95:mem_addr+64] dst[95:64] := MEM[mem_addr+63:mem_addr+32] dst[127:96] := MEM[mem_addr+31:mem_addr]
vmovdqu16
__m128i _mm_mask_loadu_epi16 (__m128i src, __mmask8 k, void const* mem_addr)

Synopsis

__m128i _mm_mask_loadu_epi16 (__m128i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW

Description

Load packed 16-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vmovdqu16
__m128i _mm_maskz_loadu_epi16 (__mmask8 k, void const* mem_addr)

Synopsis

__m128i _mm_maskz_loadu_epi16 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW

Description

Load packed 16-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovdqu16
__m256i _mm256_mask_loadu_epi16 (__m256i src, __mmask16 k, void const* mem_addr)

Synopsis

__m256i _mm256_mask_loadu_epi16 (__m256i src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW

Description

Load packed 16-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vmovdqu16
__m256i _mm256_maskz_loadu_epi16 (__mmask16 k, void const* mem_addr)

Synopsis

__m256i _mm256_maskz_loadu_epi16 (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW

Description

Load packed 16-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovdqu16
__m512i _mm512_mask_loadu_epi16 (__m512i src, __mmask32 k, void const* mem_addr)

Synopsis

__m512i _mm512_mask_loadu_epi16 (__m512i src, __mmask32 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512BW

Description

Load packed 16-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vmovdqu16
__m512i _mm512_maskz_loadu_epi16 (__mmask32 k, void const* mem_addr)

Synopsis

__m512i _mm512_maskz_loadu_epi16 (__mmask32 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512BW

Description

Load packed 16-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmovdqu32
__m128i _mm_mask_loadu_epi32 (__m128i src, __mmask8 k, void const* mem_addr)

Synopsis

__m128i _mm_mask_loadu_epi32 (__m128i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu32
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 32-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vmovdqu32
__m128i _mm_maskz_loadu_epi32 (__mmask8 k, void const* mem_addr)

Synopsis

__m128i _mm_maskz_loadu_epi32 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu32
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 32-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovdqu32
__m256i _mm256_mask_loadu_epi32 (__m256i src, __mmask8 k, void const* mem_addr)

Synopsis

__m256i _mm256_mask_loadu_epi32 (__m256i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu32
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 32-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vmovdqu32
__m256i _mm256_maskz_loadu_epi32 (__mmask8 k, void const* mem_addr)

Synopsis

__m256i _mm256_maskz_loadu_epi32 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu32
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 32-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovdqu32
__m512i _mm512_mask_loadu_epi32 (__m512i src, __mmask16 k, void const* mem_addr)

Synopsis

__m512i _mm512_mask_loadu_epi32 (__m512i src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu32 zmm {k}, m512
CPUID Flags: AVX512F

Description

Load packed 32-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vmovdqu32
__m512i _mm512_maskz_loadu_epi32 (__mmask16 k, void const* mem_addr)

Synopsis

__m512i _mm512_maskz_loadu_epi32 (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu32 zmm {k}, m512
CPUID Flags: AVX512F

Description

Load packed 32-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmovdqu64
__m128i _mm_mask_loadu_epi64 (__m128i src, __mmask8 k, void const* mem_addr)

Synopsis

__m128i _mm_mask_loadu_epi64 (__m128i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu64
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 64-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vmovdqu64
__m128i _mm_maskz_loadu_epi64 (__mmask8 k, void const* mem_addr)

Synopsis

__m128i _mm_maskz_loadu_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu64
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 64-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovdqu64
__m256i _mm256_mask_loadu_epi64 (__m256i src, __mmask8 k, void const* mem_addr)

Synopsis

__m256i _mm256_mask_loadu_epi64 (__m256i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu64
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 64-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vmovdqu64
__m256i _mm256_maskz_loadu_epi64 (__mmask8 k, void const* mem_addr)

Synopsis

__m256i _mm256_maskz_loadu_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu64
CPUID Flags: AVX512VL + AVX512F

Description

Load packed 64-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovdqu64
__m512i _mm512_mask_loadu_epi64 (__m512i src, __mmask8 k, void const* mem_addr)

Synopsis

__m512i _mm512_mask_loadu_epi64 (__m512i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu64 zmm {k}, m512
CPUID Flags: AVX512F

Description

Load packed 64-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vmovdqu64
__m512i _mm512_maskz_loadu_epi64 (__mmask8 k, void const* mem_addr)

Synopsis

__m512i _mm512_maskz_loadu_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu64 zmm {k}, m512
CPUID Flags: AVX512F

Description

Load packed 64-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmovdqu8
__m128i _mm_mask_loadu_epi8 (__m128i src, __mmask16 k, void const* mem_addr)

Synopsis

__m128i _mm_mask_loadu_epi8 (__m128i src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW

Description

Load packed 8-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vmovdqu8
__m128i _mm_maskz_loadu_epi8 (__mmask16 k, void const* mem_addr)

Synopsis

__m128i _mm_maskz_loadu_epi8 (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW

Description

Load packed 8-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovdqu8
__m256i _mm256_mask_loadu_epi8 (__m256i src, __mmask32 k, void const* mem_addr)

Synopsis

__m256i _mm256_mask_loadu_epi8 (__m256i src, __mmask32 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW

Description

Load packed 8-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vmovdqu8
__m256i _mm256_maskz_loadu_epi8 (__mmask32 k, void const* mem_addr)

Synopsis

__m256i _mm256_maskz_loadu_epi8 (__mmask32 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW

Description

Load packed 8-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovdqu8
__m512i _mm512_mask_loadu_epi8 (__m512i src, __mmask64 k, void const* mem_addr)

Synopsis

__m512i _mm512_mask_loadu_epi8 (__m512i src, __mmask64 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512BW

Description

Load packed 8-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vmovdqu8
__m512i _mm512_maskz_loadu_epi8 (__mmask64 k, void const* mem_addr)

Synopsis

__m512i _mm512_maskz_loadu_epi8 (__mmask64 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512BW

Description

Load packed 8-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
movupd
__m128d _mm_loadu_pd (double const* mem_addr)

Synopsis

__m128d _mm_loadu_pd (double const* mem_addr)
#include "emmintrin.h"
Instruction: movupd xmm, m128
CPUID Flags: SSE2

Description

Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into dst. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[127:0] := MEM[mem_addr+127:mem_addr]
vmovupd
__m128d _mm_mask_loadu_pd (__m128d src, __mmask8 k, void const* mem_addr)

Synopsis

__m128d _mm_mask_loadu_pd (__m128d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovupd
CPUID Flags: AVX512VL + AVX512F

Description

Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vmovupd
__m128d _mm_maskz_loadu_pd (__mmask8 k, void const* mem_addr)

Synopsis

__m128d _mm_maskz_loadu_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovupd
CPUID Flags: AVX512VL + AVX512F

Description

Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovupd
__m256d _mm256_loadu_pd (double const * mem_addr)

Synopsis

__m256d _mm256_loadu_pd (double const * mem_addr)
#include "immintrin.h"
Instruction: vmovupd ymm, m256
CPUID Flags: AVX

Description

Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into dst. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0
vmovupd
__m256d _mm256_mask_loadu_pd (__m256d src, __mmask8 k, void const* mem_addr)

Synopsis

__m256d _mm256_mask_loadu_pd (__m256d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovupd
CPUID Flags: AVX512VL + AVX512F

Description

Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vmovupd
__m256d _mm256_maskz_loadu_pd (__mmask8 k, void const* mem_addr)

Synopsis

__m256d _mm256_maskz_loadu_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovupd
CPUID Flags: AVX512VL + AVX512F

Description

Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovupd
__m512d _mm512_loadu_pd (void const* mem_addr)

Synopsis

__m512d _mm512_loadu_pd (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovupd zmm {k}, m512
CPUID Flags: AVX512F

Description

Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into dst. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[511:0] := MEM[mem_addr+511:mem_addr] dst[MAX:512] := 0
vmovupd
__m512d _mm512_mask_loadu_pd (__m512d src, __mmask8 k, void const* mem_addr)

Synopsis

__m512d _mm512_mask_loadu_pd (__m512d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovupd zmm {k}, m512
CPUID Flags: AVX512F

Description

Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vmovupd
__m512d _mm512_maskz_loadu_pd (__mmask8 k, void const* mem_addr)

Synopsis

__m512d _mm512_maskz_loadu_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovupd zmm {k}, m512
CPUID Flags: AVX512F

Description

Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
movups
__m128 _mm_loadu_ps (float const* mem_addr)

Synopsis

__m128 _mm_loadu_ps (float const* mem_addr)
#include "xmmintrin.h"
Instruction: movups xmm, m128
CPUID Flags: SSE

Description

Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into dst. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[127:0] := MEM[mem_addr+127:mem_addr]
vmovups
__m128 _mm_mask_loadu_ps (__m128 src, __mmask8 k, void const* mem_addr)

Synopsis

__m128 _mm_mask_loadu_ps (__m128 src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovups
CPUID Flags: AVX512VL + AVX512F

Description

Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vmovups
__m128 _mm_maskz_loadu_ps (__mmask8 k, void const* mem_addr)

Synopsis

__m128 _mm_maskz_loadu_ps (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovups
CPUID Flags: AVX512VL + AVX512F

Description

Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovups
__m256 _mm256_loadu_ps (float const * mem_addr)

Synopsis

__m256 _mm256_loadu_ps (float const * mem_addr)
#include "immintrin.h"
Instruction: vmovups ymm, m256
CPUID Flags: AVX

Description

Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into dst. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0
vmovups
__m256 _mm256_mask_loadu_ps (__m256 src, __mmask8 k, void const* mem_addr)

Synopsis

__m256 _mm256_mask_loadu_ps (__m256 src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovups
CPUID Flags: AVX512VL + AVX512F

Description

Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vmovups
__m256 _mm256_maskz_loadu_ps (__mmask8 k, void const* mem_addr)

Synopsis

__m256 _mm256_maskz_loadu_ps (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovups
CPUID Flags: AVX512VL + AVX512F

Description

Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovups
__m512 _mm512_loadu_ps (void const* mem_addr)

Synopsis

__m512 _mm512_loadu_ps (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovups zmm {k}, m512
CPUID Flags: AVX512F

Description

Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into dst. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[511:0] := MEM[mem_addr+511:mem_addr] dst[MAX:512] := 0
vmovups
__m512 _mm512_mask_loadu_ps (__m512 src, __mmask16 k, void const* mem_addr)

Synopsis

__m512 _mm512_mask_loadu_ps (__m512 src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovups zmm {k}, m512
CPUID Flags: AVX512F

Description

Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vmovups
__m512 _mm512_maskz_loadu_ps (__mmask16 k, void const* mem_addr)

Synopsis

__m512 _mm512_maskz_loadu_ps (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovups zmm {k}, m512
CPUID Flags: AVX512F

Description

Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
movdqu
__m128i _mm_loadu_si128 (__m128i const* mem_addr)

Synopsis

__m128i _mm_loadu_si128 (__m128i const* mem_addr)
#include "emmintrin.h"
Instruction: movdqu xmm, m128
CPUID Flags: SSE2

Description

Load 128-bits of integer data from memory into dst. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[127:0] := MEM[mem_addr+127:mem_addr]
movzwl+movd
__m128i _mm_loadu_si16 (void const* mem_addr)

Synopsis

__m128i _mm_loadu_si16 (void const* mem_addr)
#include "immintrin.h"
Instruction: movzwl+movd

Description

Load unaligned 16-bit integer from memory into the first element of dst.

Operation

dst[15:0] := MEM[mem_addr+15:mem_addr] dst[MAX:16] := 0
...
__m128i _mm_loadu_si16 (void const* mem_addr)

Synopsis

__m128i _mm_loadu_si16 (void const* mem_addr)
#include "immintrin.h"
CPUID Flags: SSE

Description

Load unaligned 16-bit integer from memory into the first element of dst.

Operation

dst[15:0] := MEM[mem_addr+15:mem_addr] dst[MAX:16] := 0
vmovdqu
__m256i _mm256_loadu_si256 (__m256i const * mem_addr)

Synopsis

__m256i _mm256_loadu_si256 (__m256i const * mem_addr)
#include "immintrin.h"
Instruction: vmovdqu ymm, m256
CPUID Flags: AVX

Description

Load 256-bits of integer data from memory into dst. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0
movd
__m128i _mm_loadu_si32 (void const* mem_addr)

Synopsis

__m128i _mm_loadu_si32 (void const* mem_addr)
#include "immintrin.h"
Instruction: movd xmm, m32
CPUID Flags: SSE

Description

Load unaligned 32-bit integer from memory into the first element of dst.

Operation

dst[31:0] := MEM[mem_addr+31:mem_addr] dst[MAX:32] := 0
movd
__m128i _mm_loadu_si32 (void const* mem_addr)

Synopsis

__m128i _mm_loadu_si32 (void const* mem_addr)
#include "immintrin.h"
Instruction: movd

Description

Load unaligned 32-bit integer from memory into the first element of dst.

Operation

dst[31:0] := MEM[mem_addr+31:mem_addr] dst[MAX:32] := 0
vmovdqu32
__m512i _mm512_loadu_si512 (void const* mem_addr)

Synopsis

__m512i _mm512_loadu_si512 (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu32 zmm {k}, m512
CPUID Flags: AVX512F

Description

Load 512-bits of integer data from memory into dst. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[511:0] := MEM[mem_addr+511:mem_addr] dst[MAX:512] := 0
movq
__m128i _mm_loadu_si64 (void const* mem_addr)

Synopsis

__m128i _mm_loadu_si64 (void const* mem_addr)
#include "immintrin.h"
Instruction: movq xmm, m64
CPUID Flags: SSE

Description

Load unaligned 64-bit integer from memory into the first element of dst.

Operation

dst[63:0] := MEM[mem_addr+63:mem_addr] dst[MAX:64] := 0
movq
__m128i _mm_loadu_si64 (void const* mem_addr)

Synopsis

__m128i _mm_loadu_si64 (void const* mem_addr)
#include "immintrin.h"
Instruction: movq

Description

Load unaligned 64-bit integer from memory into the first element of dst.

Operation

dst[63:0] := MEM[mem_addr+63:mem_addr] dst[MAX:64] := 0
...
__m256 _mm256_loadu2_m128 (float const* hiaddr, float const* loaddr)

Synopsis

__m256 _mm256_loadu2_m128 (float const* hiaddr, float const* loaddr)
#include "immintrin.h"
CPUID Flags: AVX

Description

Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point elements) from memory, and combine them into a 256-bit value in dst. hiaddr and loaddr do not need to be aligned on any particular boundary.

Operation

dst[127:0] := MEM[loaddr+127:loaddr] dst[255:128] := MEM[hiaddr+127:hiaddr] dst[MAX:256] := 0
...
__m256d _mm256_loadu2_m128d (double const* hiaddr, double const* loaddr)

Synopsis

__m256d _mm256_loadu2_m128d (double const* hiaddr, double const* loaddr)
#include "immintrin.h"
CPUID Flags: AVX

Description

Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point elements) from memory, and combine them into a 256-bit value in dst. hiaddr and loaddr do not need to be aligned on any particular boundary.

Operation

dst[127:0] := MEM[loaddr+127:loaddr] dst[255:128] := MEM[hiaddr+127:hiaddr] dst[MAX:256] := 0
...
__m256i _mm256_loadu2_m128i (__m128i const* hiaddr, __m128i const* loaddr)

Synopsis

__m256i _mm256_loadu2_m128i (__m128i const* hiaddr, __m128i const* loaddr)
#include "immintrin.h"
CPUID Flags: AVX

Description

Load two 128-bit values (composed of integer data) from memory, and combine them into a 256-bit value in dst. hiaddr and loaddr do not need to be aligned on any particular boundary.

Operation

dst[127:0] := MEM[loaddr+127:loaddr] dst[255:128] := MEM[hiaddr+127:hiaddr] dst[MAX:256] := 0
vloadunpackhd
__m512i _mm512_loadunpackhi_epi32 (__m512i src, void const* mt)

Synopsis

__m512i _mm512_loadunpackhi_epi32 (__m512i src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackhd zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64 and expands them into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src.

Operation

dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 15 IF foundNext64BytesBoundary == false IF (addr + (loadOffset + 1)*4 % 64) == 0 foundNext64BytesBoundary := true FI ELSE i := j*32 tmp := MEM[addr + loadOffset*4] dst[i+31:i] := tmp[i+31:i] FI loadOffset := loadOffset + 1 ENDFOR dst[MAX:512] := 0
vloadunpackhd
__m512i _mm512_mask_loadunpackhi_epi32 (__m512i src, __mmask16 k, void const * mt)

Synopsis

__m512i _mm512_mask_loadunpackhi_epi32 (__m512i src, __mmask16 k, void const * mt)
#include "immintrin.h"
Instruction: vloadunpackhd zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64 and expands them into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 15 IF k[j] IF foundNext64BytesBoundary == false IF ((addr + (loadOffset + 1)*4) % 64) == 0 foundNext64BytesBoundary := true FI ELSE i := j*32 tmp := MEM[addr + loadOffset*4] dst[i+31:i] := tmp[i+31:i] FI loadOffset := loadOffset + 1 FI ENDFOR dst[MAX:512] := 0
vloadunpackhq
__m512i _mm512_loadunpackhi_epi64 (__m512i src, void const* mt)

Synopsis

__m512i _mm512_loadunpackhi_epi64 (__m512i src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackhq zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src.

Operation

dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 7 IF foundNext64BytesBoundary == false IF ((addr + (loadOffset + 1)*8) % 64) == 0 foundNext64BytesBoundary := true FI ELSE i := j*64 tmp := MEM[addr + loadOffset*8] dst[i+63:i] := tmp[i+63:i] FI loadOffset := loadOffset + 1 ENDFOR dst[MAX:512] := 0
vloadunpackhq
__m512i _mm512_mask_loadunpackhi_epi64 (__m512i src, __mmask8 k, void const* mt)

Synopsis

__m512i _mm512_mask_loadunpackhi_epi64 (__m512i src, __mmask8 k, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackhq zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 7 IF k[j] IF foundNext64BytesBoundary == false IF ((addr + (loadOffset + 1)*8) % 64) == 0 foundNext64BytesBoundary := true FI ELSE i := j*64 tmp := MEM[addr + loadOffset*8] dst[i+63:i] := tmp[i+63:i] FI loadOffset := loadOffset + 1 FI ENDFOR dst[MAX:512] := 0
vloadunpackhpd
__m512d _mm512_loadunpackhi_pd (__m512d src, void const* mt)

Synopsis

__m512d _mm512_loadunpackhi_pd (__m512d src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackhpd zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed double-precision (64-bit) floating-point values in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src.

Operation

dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 7 IF foundNext64BytesBoundary == false IF (addr + (loadOffset + 1)*8) % 64 == 0 foundNext64BytesBoundary := true FI ELSE i := j*64 tmp := MEM[addr + loadOffset*8] dst[i+63:i] := tmp[i+63:i] FI loadOffset := loadOffset + 1 ENDFOR dst[MAX:512] := 0
vloadunpackhpd
__m512d _mm512_mask_loadunpackhi_pd (__m512d src, __mmask8 k, void const* mt)

Synopsis

__m512d _mm512_mask_loadunpackhi_pd (__m512d src, __mmask8 k, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackhpd zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed double-precision (64-bit) floating-point values in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 7 IF k[j] IF foundNext64BytesBoundary == false IF (addr + (loadOffset + 1)*8) % 64 == 0 foundNext64BytesBoundary := true FI ELSE i := j*64 tmp := MEM[addr + loadOffset*8] dst[i+63:i] := tmp[i+63:i] FI loadOffset := loadOffset + 1 FI ENDFOR dst[MAX:512] := 0
vloadunpackhps
__m512 _mm512_loadunpackhi_ps (__m512 src, void const* mt)

Synopsis

__m512 _mm512_loadunpackhi_ps (__m512 src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackhps zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt-64 and expands them into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src.

Operation

dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 15 IF foundNext64BytesBoundary == false IF ((addr + (loadOffset + 1)*4) % 64) == 0 foundNext64BytesBoundary := true FI ELSE i := j*32 tmp := MEM[addr + loadOffset*4] dst[i+31:i] := tmp[i+31:i] FI loadOffset := loadOffset + 1 ENDFOR dst[MAX:512] := 0
vloadunpackhps
__m512 _mm512_mask_loadunpackhi_ps (__m512 src, __mmask16 k, void const* mt)

Synopsis

__m512 _mm512_mask_loadunpackhi_ps (__m512 src, __mmask16 k, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackhps zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the high-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt-64 and expands them into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

dst[511:0] := src[511:0] loadOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 15 IF k[j] IF foundNext64BytesBoundary == false IF ((addr + (loadOffset + 1)*4) % 64) == 0 foundNext64BytesBoundary := true FI ELSE i := j*32 tmp := MEM[addr + loadOffset*4] dst[i+31:i] := tmp[i+31:i] FI loadOffset := loadOffset + 1 FI ENDFOR dst[MAX:512] := 0
vloadunpackld
__m512i _mm512_loadunpacklo_epi32 (__m512i src, void const* mt)

Synopsis

__m512i _mm512_loadunpacklo_epi32 (__m512i src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackld zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt and expands them into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before the first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src.

Operation

dst[511:0] := src[511:0] loadOffset := 0 addr = mt FOR j := 0 to 15 i := j*32 tmp := MEM[addr + loadOffset*4] dst[i+31:i] := tmp[i+31:i] loadOffset := loadOffset + 1 IF (mt + loadOffset * 4) % 64 == 0 break FI ENDFOR dst[MAX:512] := 0
vloadunpackld
__m512i _mm512_mask_loadunpacklo_epi32 (__m512i src, __mmask16 k, void const* mt)

Synopsis

__m512i _mm512_mask_loadunpacklo_epi32 (__m512i src, __mmask16 k, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackld zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt and expands them into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

dst[511:0] := src[511:0] loadOffset := 0 addr = mt FOR j := 0 to 15 i := j*32 IF k[j] tmp := MEM[addr + loadOffset*4] dst[i+31:i] := tmp[i+31:i] loadOffset := loadOffset + 1 IF (mt + loadOffset * 4) % 64 == 0 break FI FI ENDFOR dst[MAX:512] := 0
vloadunpacklq
__m512i _mm512_loadunpacklo_epi64 (__m512i src, void const* mt)

Synopsis

__m512i _mm512_loadunpacklo_epi64 (__m512i src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpacklq zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before the first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src.

Operation

dst[511:0] := src[511:0] loadOffset := 0 addr = mt FOR j := 0 to 7 i := j*64 tmp := MEM[addr + loadOffset*8] dst[i+63:i] := tmp[i+63:i] loadOffset := loadOffset + 1 IF ((addr + loadOffset*8) % 64) == 0 break FI ENDFOR dst[MAX:512] := 0
vloadunpacklq
__m512i _mm512_mask_loadunpacklo_epi64 (__m512i src, __mmask8 k, void const* mt)

Synopsis

__m512i _mm512_mask_loadunpacklo_epi64 (__m512i src, __mmask8 k, void const* mt)
#include "immintrin.h"
Instruction: vloadunpacklq zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before the first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

dst[511:0] := src[511:0] loadOffset := 0 addr = mt FOR j := 0 to 7 i := j*64 IF k[j] tmp := MEM[addr + loadOffset*8] dst[i+63:i] := tmp[i+63:i] loadOffset := loadOffset + 1 IF ((addr + loadOffset*8) % 64) == 0 break FI FI ENDFOR dst[MAX:512] := 0
vloadunpacklpd
__m512d _mm512_loadunpacklo_pd (__m512d src, void const* mt)

Synopsis

__m512d _mm512_loadunpacklo_pd (__m512d src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpacklpd zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed double-precision (64-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before the first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src.

Operation

dst[511:0] := src[511:0] loadOffset := 0 addr = mt FOR j := 0 to 7 i := j*64 tmp := MEM[addr + loadOffset*8] dst[i+63:i] := tmp[i+63:i] loadOffset := loadOffset + 1 IF ((addr + 8*loadOffset) % 64) == 0 BREAK FI ENDFOR dst[MAX:512] := 0
vloadunpacklpd
__m512d _mm512_mask_loadunpacklo_pd (__m512d src, __mmask8 k, void const* mt)

Synopsis

__m512d _mm512_mask_loadunpacklo_pd (__m512d src, __mmask8 k, void const* mt)
#include "immintrin.h"
Instruction: vloadunpacklpd zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed double-precision (64-bit) floating-point values in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before the first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

dst[511:0] := src[511:0] loadOffset := 0 addr = mt FOR j := 0 to 7 i := j*64 IF k[j] tmp := MEM[addr + loadOffset*8] dst[i+63:i] := tmp[i+63:i] loadOffset := loadOffset + 1 IF ((addr + 8*loadOffset) % 64) == 0 BREAK FI FI ENDFOR dst[MAX:512] := 0
vloadunpacklps
__m512 _mm512_loadunpacklo_ps (__m512 src, void const* mt)

Synopsis

__m512 _mm512_loadunpacklo_ps (__m512 src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpacklps zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt and expands them into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before the first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src.

Operation

dst[511:0] := src[511:0] loadOffset := 0 addr = mt FOR j := 0 to 15 i := j*32 tmp := MEM[addr + loadOffset*4] dst[i+31:i] := tmp[i+31:i] loadOffset := loadOffset + 1 IF (mt + loadOffset * 4) % 64 == 0 BREAK FI ENDFOR dst[MAX:512] := 0
vloadunpacklps
__m512 _mm512_mask_loadunpacklo_ps (__m512 src, __mmask16 k, void const* mt)

Synopsis

__m512 _mm512_mask_loadunpacklo_ps (__m512 src, __mmask16 k, void const* mt)
#include "immintrin.h"
Instruction: vloadunpacklps zmm {k}, m512
CPUID Flags: KNCNI

Description

Loads the low-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt and expands them into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before the first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

dst[511:0] := src[511:0] loadOffset := 0 addr = mt FOR j := 0 to 15 i := j*32 IF k[j] tmp := MEM[addr + loadOffset*4] dst[i+31:i] := tmp[i+31:i] loadOffset := loadOffset + 1 IF (mt + loadOffset * 4) % 64 == 0 break FI FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_log_pd (__m128d a)

Synopsis

__m128d _mm_log_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ln(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_log_pd (__m256d a)

Synopsis

__m256d _mm256_log_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ln(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_log_pd (__m512d a)

Synopsis

__m512d _mm512_log_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ln(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_log_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_log_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ln(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_log_ps (__m128 a)

Synopsis

__m128 _mm_log_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ln(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_log_ps (__m256 a)

Synopsis

__m256 _mm256_log_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ln(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_log_ps (__m512 a)

Synopsis

__m512 _mm512_log_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ln(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_log_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_log_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ln(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_log10_pd (__m128d a)

Synopsis

__m128d _mm_log10_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := log10(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_log10_pd (__m256d a)

Synopsis

__m256d _mm256_log10_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := log10(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_log10_pd (__m512d a)

Synopsis

__m512d _mm512_log10_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := log10(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_log10_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_log10_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := log10(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_log10_ps (__m128 a)

Synopsis

__m128 _mm_log10_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := log10(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_log10_ps (__m256 a)

Synopsis

__m256 _mm256_log10_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := log10(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_log10_ps (__m512 a)

Synopsis

__m512 _mm512_log10_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := log10(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_log10_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_log10_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := log10(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_log1p_pd (__m128d a)

Synopsis

__m128d _mm_log1p_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ln(1.0 + a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_log1p_pd (__m256d a)

Synopsis

__m256d _mm256_log1p_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ln(1.0 + a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_log1p_pd (__m512d a)

Synopsis

__m512d _mm512_log1p_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ln(1.0 + a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_log1p_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_log1p_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ln(1.0 + a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_log1p_ps (__m128 a)

Synopsis

__m128 _mm_log1p_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ln(1.0 + a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_log1p_ps (__m256 a)

Synopsis

__m256 _mm256_log1p_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ln(1.0 + a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_log1p_ps (__m512 a)

Synopsis

__m512 _mm512_log1p_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ln(1.0 + a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_log1p_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_log1p_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ln(1.0 + a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_log2_pd (__m128d a)

Synopsis

__m128d _mm_log2_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := log2(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_log2_pd (__m256d a)

Synopsis

__m256d _mm256_log2_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := log2(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_log2_pd (__m512d a)

Synopsis

__m512d _mm512_log2_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := log2(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_log2_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_log2_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := log2(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_log2_ps (__m128 a)

Synopsis

__m128 _mm_log2_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := log2(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_log2_ps (__m256 a)

Synopsis

__m256 _mm256_log2_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := log2(a[i+31:i]) ENDFOR dst[MAX:256] := 0
vlog2ps
__m512 _mm512_log2_ps (__m512 a)

Synopsis

__m512 _mm512_log2_ps (__m512 a)
#include "immintrin.h"
Instruction: vlog2ps zmm {k}, zmm
CPUID Flags: KNCNI

Description

Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := log2(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vlog2ps
__m512 _mm512_mask_log2_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_log2_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vlog2ps zmm {k}, zmm
CPUID Flags: KNCNI

Description

Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := log2(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vlog2ps
__m512 _mm512_log2ae23_ps (__m512 a)

Synopsis

__m512 _mm512_log2ae23_ps (__m512 a)
#include "immintrin.h"
Instruction: vlog2ps zmm {k}, m512
CPUID Flags: KNCNI

Description

Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in a with absolute error of 2^(-23) and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := Log2ae23(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vlog2ps
__m512 _mm512_mask_log2ae23_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_log2ae23_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vlog2ps zmm {k}, m512
CPUID Flags: KNCNI

Description

Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in a with absolute error of 2^(-23) and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := Log2ae23(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m128d _mm_logb_pd (__m128d a)

Synopsis

__m128d _mm_logb_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_logb_pd (__m256d a)

Synopsis

__m256d _mm256_logb_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_logb_pd (__m512d a)

Synopsis

__m512d _mm512_logb_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_logb_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_logb_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m128 _mm_logb_ps (__m128 a)

Synopsis

__m128 _mm_logb_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_logb_ps (__m256 a)

Synopsis

__m256 _mm256_logb_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_logb_ps (__m512 a)

Synopsis

__m512 _mm512_logb_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_logb_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_logb_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
rol
unsigned long _lrotl (unsigned long a, int shift)

Synopsis

unsigned long _lrotl (unsigned long a, int shift)
#include "immintrin.h"
Instruction: rol r64, imm

Description

Shift the bits of unsigned 64-bit integer a left by the number of bits specified in shift, rotating the most-significant bit to the least-significant bit location, and store the unsigned result in dst.

Operation

dst := a count := shift BITWISE AND 63 DO WHILE (count > 0) tmp[0] := dst[63] dst := (dst << 1) OR tmp[0] count := count - 1 OD

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge2-
Westmere1-
Nehalem1-
ror
unsigned long _lrotr (unsigned long a, int shift)

Synopsis

unsigned long _lrotr (unsigned long a, int shift)
#include "immintrin.h"
Instruction: ror r64, imm

Description

Shift the bits of unsigned 64-bit integer a right by the number of bits specified in shift, rotating the least-significant bit to the most-significant bit location, and store the unsigned result in dst.

Operation

dst := a count := shift BITWISE AND 63 DO WHILE (count > 0) tmp[63] := dst[0] dst := (dst >> 1) OR tmp[63] count := count - 1 OD

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge2-
Westmere1-
Nehalem1-
vplzcntd
__m128i _mm_lzcnt_epi32 (__m128i a)

Synopsis

__m128i _mm_lzcnt_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vplzcntd
CPUID Flags: AVX512VL + AVX512CD

Description

Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 tmp := 31 dst[i+31:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+31:i] := dst[i+31:i] + 1 OD ENDFOR dst[MAX:128] := 0
vplzcntd
__m128i _mm_mask_lzcnt_epi32 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_lzcnt_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vplzcntd
CPUID Flags: AVX512VL + AVX512CD

Description

Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] tmp := 31 dst[i+31:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+31:i] := dst[i+31:i] + 1 OD ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vplzcntd
__m128i _mm_maskz_lzcnt_epi32 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_lzcnt_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vplzcntd
CPUID Flags: AVX512VL + AVX512CD

Description

Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] tmp := 31 dst[i+31:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+31:i] := dst[i+31:i] + 1 OD ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vplzcntd
__m256i _mm256_lzcnt_epi32 (__m256i a)

Synopsis

__m256i _mm256_lzcnt_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vplzcntd
CPUID Flags: AVX512VL + AVX512CD

Description

Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 tmp := 31 dst[i+31:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+31:i] := dst[i+31:i] + 1 OD ENDFOR dst[MAX:256] := 0
vplzcntd
__m256i _mm256_mask_lzcnt_epi32 (__m256i src, __mmask8 k, __m256i a)

Synopsis

__m256i _mm256_mask_lzcnt_epi32 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vplzcntd
CPUID Flags: AVX512VL + AVX512CD

Description

Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] tmp := 31 dst[i+31:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+31:i] := dst[i+31:i] + 1 OD ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vplzcntd
__m256i _mm256_maskz_lzcnt_epi32 (__mmask8 k, __m256i a)

Synopsis

__m256i _mm256_maskz_lzcnt_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vplzcntd
CPUID Flags: AVX512VL + AVX512CD

Description

Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] tmp := 31 dst[i+31:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+31:i] := dst[i+31:i] + 1 OD ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vplzcntd
__m512i _mm512_lzcnt_epi32 (__m512i a)

Synopsis

__m512i _mm512_lzcnt_epi32 (__m512i a)
#include "immintrin.h"
Instruction: vplzcntd zmm {k}, zmm
CPUID Flags: AVX512CD

Description

Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 tmp := 31 dst[i+31:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+31:i] := dst[i+31:i] + 1 OD ENDFOR dst[MAX:512] := 0
vplzcntd
__m512i _mm512_mask_lzcnt_epi32 (__m512i src, __mmask16 k, __m512i a)

Synopsis

__m512i _mm512_mask_lzcnt_epi32 (__m512i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vplzcntd zmm {k}, zmm
CPUID Flags: AVX512CD

Description

Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] tmp := 31 dst[i+31:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+31:i] := dst[i+31:i] + 1 OD ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vplzcntd
__m512i _mm512_maskz_lzcnt_epi32 (__mmask16 k, __m512i a)

Synopsis

__m512i _mm512_maskz_lzcnt_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vplzcntd zmm {k}, zmm
CPUID Flags: AVX512CD

Description

Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] tmp := 31 dst[i+31:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+31:i] := dst[i+31:i] + 1 OD ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vplzcntq
__m128i _mm_lzcnt_epi64 (__m128i a)

Synopsis

__m128i _mm_lzcnt_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vplzcntq
CPUID Flags: AVX512VL + AVX512CD

Description

Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 tmp := 63 dst[i+63:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+63:i] := dst[i+63:i] + 1 OD ENDFOR dst[MAX:128] := 0
vplzcntq
__m128i _mm_mask_lzcnt_epi64 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_lzcnt_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vplzcntq
CPUID Flags: AVX512VL + AVX512CD

Description

Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] tmp := 63 dst[i+63:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+63:i] := dst[i+63:i] + 1 OD ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vplzcntq
__m128i _mm_maskz_lzcnt_epi64 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_lzcnt_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vplzcntq
CPUID Flags: AVX512VL + AVX512CD

Description

Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] tmp := 63 dst[i+63:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+63:i] := dst[i+63:i] + 1 OD ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vplzcntq
__m256i _mm256_lzcnt_epi64 (__m256i a)

Synopsis

__m256i _mm256_lzcnt_epi64 (__m256i a)
#include "immintrin.h"
Instruction: vplzcntq
CPUID Flags: AVX512VL + AVX512CD

Description

Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 tmp := 63 dst[i+63:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+63:i] := dst[i+63:i] + 1 OD ENDFOR dst[MAX:256] := 0
vplzcntq
__m256i _mm256_mask_lzcnt_epi64 (__m256i src, __mmask8 k, __m256i a)

Synopsis

__m256i _mm256_mask_lzcnt_epi64 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vplzcntq
CPUID Flags: AVX512VL + AVX512CD

Description

Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] tmp := 63 dst[i+63:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+63:i] := dst[i+63:i] + 1 OD ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vplzcntq
__m256i _mm256_maskz_lzcnt_epi64 (__mmask8 k, __m256i a)

Synopsis

__m256i _mm256_maskz_lzcnt_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vplzcntq
CPUID Flags: AVX512VL + AVX512CD

Description

Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] tmp := 63 dst[i+63:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+63:i] := dst[i+63:i] + 1 OD ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vplzcntq
__m512i _mm512_lzcnt_epi64 (__m512i a)

Synopsis

__m512i _mm512_lzcnt_epi64 (__m512i a)
#include "immintrin.h"
Instruction: vplzcntq zmm {k}, zmm
CPUID Flags: AVX512CD

Description

Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 tmp := 63 dst[i+63:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+63:i] := dst[i+63:i] + 1 OD ENDFOR dst[MAX:512] := 0
vplzcntq
__m512i _mm512_mask_lzcnt_epi64 (__m512i src, __mmask8 k, __m512i a)

Synopsis

__m512i _mm512_mask_lzcnt_epi64 (__m512i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vplzcntq zmm {k}, zmm
CPUID Flags: AVX512CD

Description

Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] tmp := 63 dst[i+63:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+63:i] := dst[i+63:i] + 1 OD ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vplzcntq
__m512i _mm512_maskz_lzcnt_epi64 (__mmask8 k, __m512i a)

Synopsis

__m512i _mm512_maskz_lzcnt_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vplzcntq zmm {k}, zmm
CPUID Flags: AVX512CD

Description

Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] tmp := 63 dst[i+63:i] := 0 DO WHILE (tmp >= 0 AND a[i+tmp] == 0) tmp := tmp - 1 dst[i+63:i] := dst[i+63:i] + 1 OD ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
lzcnt
unsigned int _lzcnt_u32 (unsigned int a)

Synopsis

unsigned int _lzcnt_u32 (unsigned int a)
#include "immintrin.h"
Instruction: lzcnt r32, r32
CPUID Flags: LZCNT

Description

Count the number of leading zero bits in unsigned 32-bit integer a, and return that count in dst.

Operation

tmp := 31 dst := 0 DO WHILE (tmp >= 0 AND a[tmp] == 0) tmp := tmp - 1 dst := dst + 1 OD

Performance

ArchitectureLatencyThroughput
Haswell3-
lzcnt
unsigned __int64 _lzcnt_u64 (unsigned __int64 a)

Synopsis

unsigned __int64 _lzcnt_u64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: lzcnt r64, r64
CPUID Flags: LZCNT

Description

Count the number of leading zero bits in unsigned 64-bit integer a, and return that count in dst.

Operation

tmp := 63 dst := 0 DO WHILE (tmp >= 0 AND a[tmp] == 0) tmp := tmp - 1 dst := dst + 1 OD

Performance

ArchitectureLatencyThroughput
Haswell3-
pmaddwd
__m128i _mm_madd_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_madd_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pmaddwd xmm, xmm
CPUID Flags: SSE2

Description

Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vpmaddwd
__m128i _mm_mask_madd_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_madd_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaddwd
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpmaddwd
__m128i _mm_maskz_madd_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_madd_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaddwd
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmaddwd
__m256i _mm256_madd_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_madd_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaddwd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell51
vpmaddwd
__m256i _mm256_mask_madd_epi16 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_madd_epi16 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaddwd
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpmaddwd
__m256i _mm256_maskz_madd_epi16 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_madd_epi16 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaddwd
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmaddwd
__m512i _mm512_madd_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_madd_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaddwd
CPUID Flags: AVX512BW

Description

Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ENDFOR dst[MAX:512] := 0
vpmaddwd
__m512i _mm512_mask_madd_epi16 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_madd_epi16 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaddwd
CPUID Flags: AVX512BW

Description

Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpmaddwd
__m512i _mm512_maskz_madd_epi16 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_madd_epi16 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaddwd
CPUID Flags: AVX512BW

Description

Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmadd52huq
__m128i _mm_madd52hi_epu64 (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i _mm_madd52hi_epu64 (__m128i a, __m128i b, __m128i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52 + AVX512VL

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ENDFOR dst[MAX:128] := 0
vpmadd52huq
__m128i _mm_mask_madd52hi_epu64 (__m128i a, __mmask8 k, __m128i b, __m128i c)

Synopsis

__m128i _mm_mask_madd52hi_epu64 (__m128i a, __mmask8 k, __m128i b, __m128i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52 + AVX512VL

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpmadd52huq
__m128i _mm_maskz_madd52hi_epu64 (__mmask8 k, __m128i a, __m128i b, __m128i c)

Synopsis

__m128i _mm_maskz_madd52hi_epu64 (__mmask8 k, __m128i a, __m128i b, __m128i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52 + AVX512VL

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmadd52huq
__m256i _mm256_madd52hi_epu64 (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i _mm256_madd52hi_epu64 (__m256i a, __m256i b, __m256i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52 + AVX512VL

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ENDFOR dst[MAX:256] := 0
vpmadd52huq
__m256i _mm256_mask_madd52hi_epu64 (__m256i a, __mmask8 k, __m256i b, __m256i c)

Synopsis

__m256i _mm256_mask_madd52hi_epu64 (__m256i a, __mmask8 k, __m256i b, __m256i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52 + AVX512VL

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpmadd52huq
__m256i _mm256_maskz_madd52hi_epu64 (__mmask8 k, __m256i a, __m256i b, __m256i c)

Synopsis

__m256i _mm256_maskz_madd52hi_epu64 (__mmask8 k, __m256i a, __m256i b, __m256i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52 + AVX512VL

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmadd52huq
__m512i _mm512_madd52hi_epu64 (__m512i a, __m512i b, __m512i c)

Synopsis

__m512i _mm512_madd52hi_epu64 (__m512i a, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ENDFOR dst[MAX:512] := 0
vpmadd52huq
__m512i _mm512_mask_madd52hi_epu64 (__m512i a, __mmask8 k, __m512i b, __m512i c)

Synopsis

__m512i _mm512_mask_madd52hi_epu64 (__m512i a, __mmask8 k, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpmadd52huq
__m512i _mm512_maskz_madd52hi_epu64 (__mmask8 k, __m512i a, __m512i b, __m512i c)

Synopsis

__m512i _mm512_maskz_madd52hi_epu64 (__mmask8 k, __m512i a, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmadd52luq
__m128i _mm_madd52lo_epu64 (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i _mm_madd52lo_epu64 (__m128i a, __m128i b, __m128i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52 + AVX512VL

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) ENDFOR dst[MAX:128] := 0
vpmadd52luq
__m128i _mm_mask_madd52lo_epu64 (__m128i a, __mmask8 k, __m128i b, __m128i c)

Synopsis

__m128i _mm_mask_madd52lo_epu64 (__m128i a, __mmask8 k, __m128i b, __m128i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52 + AVX512VL

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpmadd52luq
__m128i _mm_maskz_madd52lo_epu64 (__mmask8 k, __m128i a, __m128i b, __m128i c)

Synopsis

__m128i _mm_maskz_madd52lo_epu64 (__mmask8 k, __m128i a, __m128i b, __m128i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52 + AVX512VL

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmadd52luq
__m256i _mm256_madd52lo_epu64 (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i _mm256_madd52lo_epu64 (__m256i a, __m256i b, __m256i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52 + AVX512VL

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) ENDFOR dst[MAX:256] := 0
vpmadd52luq
__m256i _mm256_mask_madd52lo_epu64 (__m256i a, __mmask8 k, __m256i b, __m256i c)

Synopsis

__m256i _mm256_mask_madd52lo_epu64 (__m256i a, __mmask8 k, __m256i b, __m256i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52 + AVX512VL

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpmadd52luq
__m256i _mm256_maskz_madd52lo_epu64 (__mmask8 k, __m256i a, __m256i b, __m256i c)

Synopsis

__m256i _mm256_maskz_madd52lo_epu64 (__mmask8 k, __m256i a, __m256i b, __m256i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52 + AVX512VL

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmadd52luq
__m512i _mm512_madd52lo_epu64 (__m512i a, __m512i b, __m512i c)

Synopsis

__m512i _mm512_madd52lo_epu64 (__m512i a, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) ENDFOR dst[MAX:512] := 0
vpmadd52luq
__m512i _mm512_mask_madd52lo_epu64 (__m512i a, __mmask8 k, __m512i b, __m512i c)

Synopsis

__m512i _mm512_mask_madd52lo_epu64 (__m512i a, __mmask8 k, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpmadd52luq
__m512i _mm512_maskz_madd52lo_epu64 (__mmask8 k, __m512i a, __m512i b, __m512i c)

Synopsis

__m512i _mm512_maskz_madd52lo_epu64 (__mmask8 k, __m512i a, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52

Description

Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
pmaddubsw
__m128i _mm_maddubs_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_maddubs_epi16 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: pmaddubsw xmm, xmm
CPUID Flags: SSSE3

Description

Vertically multiply each unsigned 8-bit integer from a with the corresponding signed 8-bit integer from b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 5 | 1
Ivy Bridge | 3 | 1
Sandy Bridge | 3 | 1
Westmere | 3 | 1
Nehalem | 3 | 1
vpmaddubsw
__m128i _mm_mask_maddubs_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_maddubs_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaddubsw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpmaddubsw
__m128i _mm_maskz_maddubs_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_maddubs_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaddubsw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmaddubsw
__m256i _mm256_maddubs_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_maddubs_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaddubsw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Vertically multiply each unsigned 8-bit integer from a with the corresponding signed 8-bit integer from b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 5 | 1
vpmaddubsw
__m256i _mm256_mask_maddubs_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_maddubs_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaddubsw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpmaddubsw
__m256i _mm256_maskz_maddubs_epi16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_maddubs_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaddubsw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmaddubsw
__m512i _mm512_maddubs_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_maddubs_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaddubsw
CPUID Flags: AVX512BW

Description

Vertically multiply each unsigned 8-bit integer from a with the corresponding signed 8-bit integer from b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst.

Operation

FOR j := 0 to 31 i := j*16 dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ENDFOR dst[MAX:512] := 0
vpmaddubsw
__m512i _mm512_mask_maddubs_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_maddubs_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaddubsw
CPUID Flags: AVX512BW

Description

Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpmaddubsw
__m512i _mm512_maskz_maddubs_epi16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_maddubs_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaddubsw
CPUID Flags: AVX512BW

Description

Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
pmaddubsw
__m64 _mm_maddubs_pi16 (__m64 a, __m64 b)

Synopsis

__m64 _mm_maddubs_pi16 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: pmaddubsw mm, mm
CPUID Flags: SSSE3

Description

Vertically multiply each unsigned 8-bit integer from a with the corresponding signed 8-bit integer from b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst.

Operation

FOR j := 0 to 3 i := j*16 dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ENDFOR
void* _mm_malloc (size_t size, size_t align)

Synopsis

void* _mm_malloc (size_t size, size_t align)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Allocate size bytes of memory, aligned to the alignment specified in align, and return a pointer to the allocated memory. _mm_free should be used to free memory that is allocated with _mm_malloc.
kmov
int _mm512_mask2int (__mmask16 k1)

Synopsis

int _mm512_mask2int (__mmask16 k1)
#include "immintrin.h"
Instruction: kmov r32, k
CPUID Flags: KNCNI

Description

Converts the bit mask k1 into an integer value, storing the result in dst.

Operation

dst := SignExtend(k1)
vpmaskmovd
__m128i _mm_maskload_epi32 (int const* mem_addr, __m128i mask)

Synopsis

__m128i _mm_maskload_epi32 (int const* mem_addr, __m128i mask)
#include "immintrin.h"
Instruction: vpmaskmovd xmm, xmm, m128
CPUID Flags: AVX2

Description

Load packed 32-bit integers from memory into dst using mask (elements are zeroed out when the highest bit is not set in the corresponding element).

Operation

FOR j := 0 to 3 i := j*32 IF mask[i+31] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0

Performance

Architecture | Latency | Throughput
Haswell | 2 | -
vpmaskmovd
__m256i _mm256_maskload_epi32 (int const* mem_addr, __m256i mask)

Synopsis

__m256i _mm256_maskload_epi32 (int const* mem_addr, __m256i mask)
#include "immintrin.h"
Instruction: vpmaskmovd ymm, ymm, m256
CPUID Flags: AVX2

Description

Load packed 32-bit integers from memory into dst using mask (elements are zeroed out when the highest bit is not set in the corresponding element).

Operation

FOR j := 0 to 7 i := j*32 IF mask[i+31] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 2 | -
vpmaskmovq
__m128i _mm_maskload_epi64 (__int64 const* mem_addr, __m128i mask)

Synopsis

__m128i _mm_maskload_epi64 (__int64 const* mem_addr, __m128i mask)
#include "immintrin.h"
Instruction: vpmaskmovq xmm, xmm, m128
CPUID Flags: AVX2

Description

Load packed 64-bit integers from memory into dst using mask (elements are zeroed out when the highest bit is not set in the corresponding element).

Operation

FOR j := 0 to 1 i := j*64 IF mask[i+63] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0

Performance

Architecture | Latency | Throughput
Haswell | 2 | -
vpmaskmovq
__m256i _mm256_maskload_epi64 (__int64 const* mem_addr, __m256i mask)

Synopsis

__m256i _mm256_maskload_epi64 (__int64 const* mem_addr, __m256i mask)
#include "immintrin.h"
Instruction: vpmaskmovq ymm, ymm, m256
CPUID Flags: AVX2

Description

Load packed 64-bit integers from memory into dst using mask (elements are zeroed out when the highest bit is not set in the corresponding element).

Operation

FOR j := 0 to 3 i := j*64 IF mask[i+63] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 2 | -
vmaskmovpd
__m128d _mm_maskload_pd (double const * mem_addr, __m128i mask)

Synopsis

__m128d _mm_maskload_pd (double const * mem_addr, __m128i mask)
#include "immintrin.h"
Instruction: vmaskmovpd xmm, xmm, m128
CPUID Flags: AVX

Description

Load packed double-precision (64-bit) floating-point elements from memory into dst using mask (elements are zeroed out when the high bit of the corresponding element is not set).

Operation

FOR j := 0 to 1 i := j*64 IF mask[i+63] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0

Performance

Architecture | Latency | Throughput
Haswell | 2 | -
Ivy Bridge | 2 | -
Sandy Bridge | 2 | -
vmaskmovpd
__m256d _mm256_maskload_pd (double const * mem_addr, __m256i mask)

Synopsis

__m256d _mm256_maskload_pd (double const * mem_addr, __m256i mask)
#include "immintrin.h"
Instruction: vmaskmovpd ymm, ymm, m256
CPUID Flags: AVX

Description

Load packed double-precision (64-bit) floating-point elements from memory into dst using mask (elements are zeroed out when the high bit of the corresponding element is not set).

Operation

FOR j := 0 to 3 i := j*64 IF mask[i+63] dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 2 | -
Ivy Bridge | 2 | -
Sandy Bridge | 2 | -
vmaskmovps
__m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)

Synopsis

__m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)
#include "immintrin.h"
Instruction: vmaskmovps xmm, xmm, m128
CPUID Flags: AVX

Description

Load packed single-precision (32-bit) floating-point elements from memory into dst using mask (elements are zeroed out when the high bit of the corresponding element is not set).

Operation

FOR j := 0 to 3 i := j*32 IF mask[i+31] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0

Performance

Architecture | Latency | Throughput
Haswell | 2 | -
Ivy Bridge | 2 | -
Sandy Bridge | 2 | -
vmaskmovps
__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)

Synopsis

__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)
#include "immintrin.h"
Instruction: vmaskmovps ymm, ymm, m256
CPUID Flags: AVX

Description

Load packed single-precision (32-bit) floating-point elements from memory into dst using mask (elements are zeroed out when the high bit of the corresponding element is not set).

Operation

FOR j := 0 to 7 i := j*32 IF mask[i+31] dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 2 | -
Ivy Bridge | 2 | -
Sandy Bridge | 2 | -
maskmovq
void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr)

Synopsis

void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr)
#include "xmmintrin.h"
Instruction: maskmovq mm, mm
CPUID Flags: SSE

Description

Conditionally store 8-bit integer elements from a into memory using mask (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint.

Operation

FOR j := 0 to 7 i := j*8 IF mask[i+7] MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] FI ENDFOR
maskmovdqu
void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)

Synopsis

void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)
#include "emmintrin.h"
Instruction: maskmovdqu xmm, xmm
CPUID Flags: SSE2

Description

Conditionally store 8-bit integer elements from a into memory using mask (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 15 i := j*8 IF mask[i+7] MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] FI ENDFOR
maskmovq
void _m_maskmovq (__m64 a, __m64 mask, char* mem_addr)

Synopsis

void _m_maskmovq (__m64 a, __m64 mask, char* mem_addr)
#include "xmmintrin.h"
Instruction: maskmovq mm, mm
CPUID Flags: SSE

Description

Conditionally store 8-bit integer elements from a into memory using mask (elements are not stored when the highest bit is not set in the corresponding element).

Operation

FOR j := 0 to 7 i := j*8 IF mask[i+7] MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] FI ENDFOR
vpmaskmovd
void _mm_maskstore_epi32 (int* mem_addr, __m128i mask, __m128i a)

Synopsis

void _mm_maskstore_epi32 (int* mem_addr, __m128i mask, __m128i a)
#include "immintrin.h"
Instruction: vpmaskmovd m128, xmm, xmm
CPUID Flags: AVX2

Description

Store packed 32-bit integers from a into memory using mask (elements are not stored when the highest bit is not set in the corresponding element).

Operation

FOR j := 0 to 3 i := j*32 IF mask[i+31] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 4 | -
vpmaskmovd
void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a)

Synopsis

void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a)
#include "immintrin.h"
Instruction: vpmaskmovd m256, ymm, ymm
CPUID Flags: AVX2

Description

Store packed 32-bit integers from a into memory using mask (elements are not stored when the highest bit is not set in the corresponding element).

Operation

FOR j := 0 to 7 i := j*32 IF mask[i+31] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 4 | -
vpmaskmovq
void _mm_maskstore_epi64 (__int64* mem_addr, __m128i mask, __m128i a)

Synopsis

void _mm_maskstore_epi64 (__int64* mem_addr, __m128i mask, __m128i a)
#include "immintrin.h"
Instruction: vpmaskmovq m128, xmm, xmm
CPUID Flags: AVX2

Description

Store packed 64-bit integers from a into memory using mask (elements are not stored when the highest bit is not set in the corresponding element).

Operation

FOR j := 0 to 1 i := j*64 IF mask[i+63] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 4 | -
vpmaskmovq
void _mm256_maskstore_epi64 (__int64* mem_addr, __m256i mask, __m256i a)

Synopsis

void _mm256_maskstore_epi64 (__int64* mem_addr, __m256i mask, __m256i a)
#include "immintrin.h"
Instruction: vpmaskmovq m256, ymm, ymm
CPUID Flags: AVX2

Description

Store packed 64-bit integers from a into memory using mask (elements are not stored when the highest bit is not set in the corresponding element).

Operation

FOR j := 0 to 3 i := j*64 IF mask[i+63] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 4 | -
vmaskmovpd
void _mm_maskstore_pd (double * mem_addr, __m128i mask, __m128d a)

Synopsis

void _mm_maskstore_pd (double * mem_addr, __m128i mask, __m128d a)
#include "immintrin.h"
Instruction: vmaskmovpd m128, xmm, xmm
CPUID Flags: AVX

Description

Store packed double-precision (64-bit) floating-point elements from a into memory using mask.

Operation

FOR j := 0 to 1 i := j*64 IF mask[i+63] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 4 | -
Ivy Bridge | 1 | -
Sandy Bridge | 1 | -
vmaskmovpd
void _mm256_maskstore_pd (double * mem_addr, __m256i mask, __m256d a)

Synopsis

void _mm256_maskstore_pd (double * mem_addr, __m256i mask, __m256d a)
#include "immintrin.h"
Instruction: vmaskmovpd m256, ymm, ymm
CPUID Flags: AVX

Description

Store packed double-precision (64-bit) floating-point elements from a into memory using mask.

Operation

FOR j := 0 to 3 i := j*64 IF mask[i+63] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 4 | -
Ivy Bridge | 1 | -
Sandy Bridge | 1 | -
vmaskmovps
void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)

Synopsis

void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)
#include "immintrin.h"
Instruction: vmaskmovps m128, xmm, xmm
CPUID Flags: AVX

Description

Store packed single-precision (32-bit) floating-point elements from a into memory using mask.

Operation

FOR j := 0 to 3 i := j*32 IF mask[i+31] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 4 | -
Ivy Bridge | 1 | -
Sandy Bridge | 1 | -
vmaskmovps
void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)

Synopsis

void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)
#include "immintrin.h"
Instruction: vmaskmovps m256, ymm, ymm
CPUID Flags: AVX

Description

Store packed single-precision (32-bit) floating-point elements from a into memory using mask.

Operation

FOR j := 0 to 7 i := j*32 IF mask[i+31] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 4 | -
Ivy Bridge | 1 | -
Sandy Bridge | 1 | -
vpmaxsw
__m128i _mm_mask_max_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_max_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpmaxsw
__m128i _mm_maskz_max_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_max_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
pmaxsw
__m128i _mm_max_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_max_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pmaxsw xmm, xmm
CPUID Flags: SSE2

Description

Compare packed 16-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 7 i := j*16 IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
Ivy Bridge | 1 | -
Sandy Bridge | 1 | -
Westmere | 1 | -
Nehalem | 1 | -
vpmaxsw
__m256i _mm256_mask_max_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_max_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpmaxsw
__m256i _mm256_maskz_max_epi16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_max_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmaxsw
__m256i _mm256_max_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_max_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 16-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 15 i := j*16 IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vpmaxsw
__m512i _mm512_mask_max_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_max_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpmaxsw
__m512i _mm512_maskz_max_epi16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_max_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmaxsw
__m512i _mm512_max_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_max_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 31 i := j*16 IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpmaxsd
__m128i _mm_mask_max_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_max_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpmaxsd
__m128i _mm_maskz_max_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_max_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
pmaxsd
__m128i _mm_max_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_max_epi32 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pmaxsd xmm, xmm
CPUID Flags: SSE4.1

Description

Compare packed 32-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 3 i := j*32 IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
Ivy Bridge | 1 | 0.5
Sandy Bridge | 1 | 0.5
Westmere | 1 | 1
Nehalem | 1 | 1
vpmaxsd
__m256i _mm256_mask_max_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_max_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpmaxsd
__m256i _mm256_maskz_max_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_max_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmaxsd
__m256i _mm256_max_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_max_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 32-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 7 i := j*32 IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
vpmaxsd
__m512i _mm512_mask_max_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_max_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpmaxsd
__m512i _mm512_maskz_max_epi32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_max_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmaxsd
__m512i _mm512_max_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_max_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 15 i := j*32 IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpmaxsq
__m128i _mm_mask_max_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_max_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpmaxsq
__m128i _mm_maskz_max_epi64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_max_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmaxsq
__m128i _mm_max_epi64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_max_epi64 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 1 i := j*64 IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpmaxsq
__m256i _mm256_mask_max_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_max_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpmaxsq
__m256i _mm256_maskz_max_epi64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_max_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmaxsq
__m256i _mm256_max_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_max_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 3 i := j*64 IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpmaxsq
__m512i _mm512_mask_max_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_max_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpmaxsq
__m512i _mm512_maskz_max_epi64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_max_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmaxsq
__m512i _mm512_max_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_max_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 7 i := j*64 IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpmaxsb
__m128i _mm_mask_max_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_max_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpmaxsb
__m128i _mm_maskz_max_epi8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_max_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
pmaxsb
__m128i _mm_max_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_max_epi8 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pmaxsb xmm, xmm
CPUID Flags: SSE4.1

Description

Compare packed 8-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 15 i := j*8 IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
Ivy Bridge | 1 | 0.5
Sandy Bridge | 1 | 0.5
Westmere | 1 | 1
Nehalem | 1 | 1
vpmaxsb
__m256i _mm256_mask_max_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_max_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpmaxsb
__m256i _mm256_maskz_max_epi8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_max_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmaxsb
__m256i _mm256_max_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_max_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 8-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 31 i := j*8 IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vpmaxsb
__m512i _mm512_mask_max_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_max_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpmaxsb
__m512i _mm512_maskz_max_epi8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_max_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmaxsb
__m512i _mm512_max_epi8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_max_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 63 i := j*8 IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpmaxuw
__m128i _mm_mask_max_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_max_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpmaxuw
__m128i _mm_maskz_max_epu16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_max_epu16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
pmaxuw
__m128i _mm_max_epu16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_max_epu16 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pmaxuw xmm, xmm
CPUID Flags: SSE4.1

Description

Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 7 i := j*16 IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
Ivy Bridge | 1 | 0.5
Sandy Bridge | 1 | 0.5
Westmere | 1 | 1
Nehalem | 1 | 1
vpmaxuw
__m256i _mm256_mask_max_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_max_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpmaxuw
__m256i _mm256_maskz_max_epu16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_max_epu16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmaxuw
__m256i _mm256_max_epu16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_max_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxuw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 15 i := j*16 IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vpmaxuw
__m512i _mm512_mask_max_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_max_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpmaxuw
__m512i _mm512_maskz_max_epu16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_max_epu16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmaxuw
__m512i _mm512_max_epu16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_max_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 31 i := j*16 IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpmaxud
__m128i _mm_mask_max_epu32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_max_epu32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpmaxud
__m128i _mm_maskz_max_epu32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_max_epu32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
pmaxud
__m128i _mm_max_epu32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_max_epu32 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pmaxud xmm, xmm
CPUID Flags: SSE4.1

Description

Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 3 i := j*32 IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
Ivy Bridge | 1 | 0.5
Sandy Bridge | 1 | 0.5
Westmere | 1 | 1
Nehalem | 1 | 1
vpmaxud
__m256i _mm256_mask_max_epu32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_max_epu32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpmaxud
__m256i _mm256_maskz_max_epu32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_max_epu32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmaxud
__m256i _mm256_max_epu32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_max_epu32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxud ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 7 i := j*32 IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
vpmaxud
__m512i _mm512_mask_max_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_max_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxud zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpmaxud
__m512i _mm512_maskz_max_epu32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_max_epu32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxud zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmaxud
__m512i _mm512_max_epu32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_max_epu32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxud zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 15 i := j*32 IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpmaxuq
__m128i _mm_mask_max_epu64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_max_epu64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpmaxuq
__m128i _mm_maskz_max_epu64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_max_epu64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmaxuq
__m128i _mm_max_epu64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_max_epu64 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 1 i := j*64 IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpmaxuq
__m256i _mm256_mask_max_epu64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_max_epu64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpmaxuq
__m256i _mm256_maskz_max_epu64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_max_epu64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmaxuq
__m256i _mm256_max_epu64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_max_epu64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 3 i := j*64 IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpmaxuq
__m512i _mm512_mask_max_epu64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_max_epu64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxuq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpmaxuq
__m512i _mm512_maskz_max_epu64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_max_epu64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxuq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmaxuq
__m512i _mm512_max_epu64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_max_epu64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxuq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 7 i := j*64 IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpmaxub
__m128i _mm_mask_max_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_max_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpmaxub
__m128i _mm_maskz_max_epu8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_max_epu8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
pmaxub
__m128i _mm_max_epu8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_max_epu8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pmaxub xmm, xmm
CPUID Flags: SSE2

Description

Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 15 i := j*8 IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
Ivy Bridge | 1 | -
Sandy Bridge | 1 | -
Westmere | 1 | -
Nehalem | 1 | -
vpmaxub
__m256i _mm256_mask_max_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_max_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpmaxub
__m256i _mm256_maskz_max_epu8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_max_epu8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmaxub
__m256i _mm256_max_epu8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_max_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxub ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 31 i := j*8 IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vpmaxub
__m512i _mm512_mask_max_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_max_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpmaxub
__m512i _mm512_maskz_max_epu8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_max_epu8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmaxub
__m512i _mm512_max_epu8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_max_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 63 i := j*8 IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR dst[MAX:512] := 0
vmaxpd
__m128d _mm_mask_max_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_max_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmaxpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vmaxpd
__m128d _mm_maskz_max_pd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_max_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmaxpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
maxpd
__m128d _mm_max_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_max_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: maxpd xmm, xmm
CPUID Flags: SSE2

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 3 | 0.8
Ivy Bridge | 3 | 1
Sandy Bridge | 3 | 1
Westmere | 3 | 1
Nehalem | 3 | 1
vmaxpd
__m256d _mm256_mask_max_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_mask_max_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vmaxpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vmaxpd
__m256d _mm256_maskz_max_pd (__mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_maskz_max_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vmaxpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmaxpd
__m256d _mm256_max_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_max_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vmaxpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 3 | 1
Ivy Bridge | 3 | 1
Sandy Bridge | 3 | 1
vmaxpd
__m512d _mm512_mask_max_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_max_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vmaxpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vmaxpd
__m512d _mm512_maskz_max_pd (__mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_maskz_max_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vmaxpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmaxpd
__m512d _mm512_max_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_max_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vmaxpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
pmaxsw
__m64 _mm_max_pi16 (__m64 a, __m64 b)

Synopsis

__m64 _mm_max_pi16 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pmaxsw mm, mm
CPUID Flags: SSE

Description

Compare packed 16-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 3 i := j*16 IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vmaxps
__m128 _mm_mask_max_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_max_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmaxps
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vmaxps
__m128 _mm_maskz_max_ps (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_max_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmaxps
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
maxps
__m128 _mm_max_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_max_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: maxps xmm, xmm
CPUID Flags: SSE

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 3 | 1
Ivy Bridge | 3 | 1
Sandy Bridge | 3 | 1
Westmere | 3 | 1
Nehalem | 3 | 1
vmaxps
__m256 _mm256_mask_max_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_mask_max_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vmaxps
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vmaxps
__m256 _mm256_maskz_max_ps (__mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_maskz_max_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vmaxps
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmaxps
__m256 _mm256_max_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_max_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vmaxps ymm, ymm, ymm
CPUID Flags: AVX

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 3 | 1
Ivy Bridge | 3 | 1
Sandy Bridge | 3 | 1
vmaxps
__m512 _mm512_mask_max_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_max_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vmaxps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vmaxps
__m512 _mm512_maskz_max_ps (__mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_maskz_max_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vmaxps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmaxps
__m512 _mm512_max_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_max_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vmaxps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
pmaxub
__m64 _mm_max_pu8 (__m64 a, __m64 b)

Synopsis

__m64 _mm_max_pu8 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pmaxub mm, mm
CPUID Flags: SSE

Description

Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 7 i := j*8 IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vmaxpd
__m512d _mm512_mask_max_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int sae)

Synopsis

__m512d _mm512_mask_max_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int sae)
#include "immintrin.h"
Instruction: vmaxpd zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vmaxpd
__m512d _mm512_maskz_max_round_pd (__mmask8 k, __m512d a, __m512d b, int sae)

Synopsis

__m512d _mm512_maskz_max_round_pd (__mmask8 k, __m512d a, __m512d b, int sae)
#include "immintrin.h"
Instruction: vmaxpd zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmaxpd
__m512d _mm512_max_round_pd (__m512d a, __m512d b, int sae)

Synopsis

__m512d _mm512_max_round_pd (__m512d a, __m512d b, int sae)
#include "immintrin.h"
Instruction: vmaxpd zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst. Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
vmaxps
__m512 _mm512_mask_max_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int sae)

Synopsis

__m512 _mm512_mask_max_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int sae)
#include "immintrin.h"
Instruction: vmaxps zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vmaxps
__m512 _mm512_maskz_max_round_ps (__mmask16 k, __m512 a, __m512 b, int sae)

Synopsis

__m512 _mm512_maskz_max_round_ps (__mmask16 k, __m512 a, __m512 b, int sae)
#include "immintrin.h"
Instruction: vmaxps zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmaxps
__m512 _mm512_max_round_ps (__m512 a, __m512 b, int sae)

Synopsis

__m512 _mm512_max_round_ps (__m512 a, __m512 b, int sae)
#include "immintrin.h"
Instruction: vmaxps zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst. Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
vmaxsd
__m128d _mm_mask_max_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int sae)

Synopsis

__m128d _mm_mask_max_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int sae)
#include "immintrin.h"
Instruction: vmaxsd xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

IF k[0] dst[63:0] := MAX(a[63:0], b[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vmaxsd
__m128d _mm_maskz_max_round_sd (__mmask8 k, __m128d a, __m128d b, int sae)

Synopsis

__m128d _mm_maskz_max_round_sd (__mmask8 k, __m128d a, __m128d b, int sae)
#include "immintrin.h"
Instruction: vmaxsd xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

IF k[0] dst[63:0] := MAX(a[63:0], b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vmaxsd
__m128d _mm_max_round_sd (__m128d a, __m128d b, int sae)

Synopsis

__m128d _mm_max_round_sd (__m128d a, __m128d b, int sae)
#include "immintrin.h"
Instruction: vmaxsd xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst. Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

dst[63:0] := MAX(a[63:0], b[63:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0
vmaxss
__m128 _mm_mask_max_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int sae)

Synopsis

__m128 _mm_mask_max_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int sae)
#include "immintrin.h"
Instruction: vmaxss xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

IF k[0] dst[31:0] := MAX(a[31:0], b[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vmaxss
__m128 _mm_maskz_max_round_ss (__mmask8 k, __m128 a, __m128 b, int sae)

Synopsis

__m128 _mm_maskz_max_round_ss (__mmask8 k, __m128 a, __m128 b, int sae)
#include "immintrin.h"
Instruction: vmaxss xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

IF k[0] dst[31:0] := MAX(a[31:0], b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vmaxss
__m128 _mm_max_round_ss (__m128 a, __m128 b, int sae)

Synopsis

__m128 _mm_max_round_ss (__m128 a, __m128 b, int sae)
#include "immintrin.h"
Instruction: vmaxss xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst. Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

dst[31:0] := MAX(a[31:0], b[31:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
vmaxsd
__m128d _mm_mask_max_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_max_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmaxsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := MAX(a[63:0], b[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vmaxsd
__m128d _mm_maskz_max_sd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_max_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmaxsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := MAX(a[63:0], b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
maxsd
__m128d _mm_max_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_max_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: maxsd xmm, xmm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := MAX(a[63:0], b[63:0]) dst[127:64] := a[127:64]

Performance

Architecture | Latency | Throughput
Haswell | 3 | 0.8
Ivy Bridge | 3 | 1
Sandy Bridge | 3 | 1
Westmere | 3 | 1
Nehalem | 3 | 1
vmaxss
__m128 _mm_mask_max_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_max_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmaxss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[31:0] := MAX(a[31:0], b[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vmaxss
__m128 _mm_maskz_max_ss (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_max_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmaxss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[31:0] := MAX(a[31:0], b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
maxss
__m128 _mm_max_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_max_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: maxss xmm, xmm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[31:0] := MAX(a[31:0], b[31:0]) dst[127:32] := a[127:32]

Performance

Architecture | Latency | Throughput
Haswell | 3 | 1
Ivy Bridge | 3 | 1
Sandy Bridge | 3 | 1
Westmere | 3 | 1
Nehalem | 3 | 1
vgmaxabsps
__m512 _mm512_mask_maxabs_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_maxabs_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgmaxabsps zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Compute the maximum of the absolute values of each pair of corresponding packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i])) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vgmaxabsps
__m512 _mm512_maxabs_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_maxabs_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgmaxabsps zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Compute the maximum of the absolute values of each pair of corresponding packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i])) ENDFOR dst[MAX:512] := 0
...
int _may_i_use_cpu_feature (unsigned __int64 a)

Synopsis

int _may_i_use_cpu_feature (unsigned __int64 a)
#include "immintrin.h"

Description

Dynamically query the processor to determine if the processor-specific feature(s) specified in a are available, and return true or false (1 or 0) if the set of features is available. Multiple features may be OR'd together. This intrinsic does not check the processor vendor. See the valid feature flags below:

Operation

_FEATURE_GENERIC_IA32 _FEATURE_FPU _FEATURE_CMOV _FEATURE_MMX _FEATURE_FXSAVE _FEATURE_SSE _FEATURE_SSE2 _FEATURE_SSE3 _FEATURE_SSSE3 _FEATURE_SSE4_1 _FEATURE_SSE4_2 _FEATURE_MOVBE _FEATURE_POPCNT _FEATURE_PCLMULQDQ _FEATURE_AES _FEATURE_F16C _FEATURE_AVX _FEATURE_RDRND _FEATURE_FMA _FEATURE_BMI _FEATURE_LZCNT _FEATURE_HLE _FEATURE_RTM _FEATURE_AVX2 _FEATURE_KNCNI _FEATURE_AVX512F _FEATURE_ADX _FEATURE_RDSEED _FEATURE_AVX512ER _FEATURE_AVX512PF _FEATURE_AVX512CD _FEATURE_SHA _FEATURE_MPX
mfence
void _mm_mfence (void)

Synopsis

void _mm_mfence (void)
#include "emmintrin.h"
Instruction: mfence
CPUID Flags: SSE2

Description

Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction is globally visible before any memory instruction which follows the fence in program order.

Performance

Architecture | Latency | Throughput
Westmere | 1 | -
Nehalem | 1 | -
vpminsw
__m128i _mm_mask_min_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_min_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpminsw
__m128i _mm_maskz_min_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_min_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
pminsw
__m128i _mm_min_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_min_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pminsw xmm, xmm
CPUID Flags: SSE2

Description

Compare packed 16-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 7 i := j*16 IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
Ivy Bridge | 1 | -
Sandy Bridge | 1 | -
Westmere | 1 | -
Nehalem | 1 | -
vpminsw
__m256i _mm256_mask_min_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_min_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpminsw
__m256i _mm256_maskz_min_epi16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_min_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpminsw
__m256i _mm256_min_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_min_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 16-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 15 i := j*16 IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vpminsw
__m512i _mm512_mask_min_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_min_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpminsw
__m512i _mm512_maskz_min_epi16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_min_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpminsw
__m512i _mm512_min_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_min_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsw
CPUID Flags: AVX512BW

Description

Compare packed 16-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 31 i := j*16 IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpminsd
__m128i _mm_mask_min_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_min_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpminsd
__m128i _mm_maskz_min_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_min_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
pminsd
__m128i _mm_min_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_min_epi32 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pminsd xmm, xmm
CPUID Flags: SSE4.1

Description

Compare packed 32-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 3 i := j*32 IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
Ivy Bridge | 1 | 0.5
Sandy Bridge | 1 | 0.5
Westmere | 1 | 1
Nehalem | 1 | 1
vpminsd
__m256i _mm256_mask_min_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_min_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpminsd
__m256i _mm256_maskz_min_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_min_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpminsd
__m256i _mm256_min_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_min_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 32-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 7 i := j*32 IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vpminsd
__m512i _mm512_mask_min_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_min_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpminsd
__m512i _mm512_maskz_min_epi32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_min_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpminsd
__m512i _mm512_min_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_min_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed 32-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 15 i := j*32 IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpminsq
__m128i _mm_mask_min_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_min_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpminsq
__m128i _mm_maskz_min_epi64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_min_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpminsq
__m128i _mm_min_epi64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_min_epi64 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 1 i := j*64 IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpminsq
__m256i _mm256_mask_min_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_min_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpminsq
__m256i _mm256_maskz_min_epi64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_min_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpminsq
__m256i _mm256_min_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_min_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 3 i := j*64 IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpminsq
__m512i _mm512_mask_min_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_min_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpminsq
__m512i _mm512_maskz_min_epi64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_min_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpminsq
__m512i _mm512_min_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_min_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed 64-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 7 i := j*64 IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpminsb
__m128i _mm_mask_min_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_min_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpminsb
__m128i _mm_maskz_min_epi8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_min_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
pminsb
__m128i _mm_min_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_min_epi8 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pminsb xmm, xmm
CPUID Flags: SSE4.1

Description

Compare packed 8-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 15 i := j*8 IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
Ivy Bridge | 1 | 0.5
Sandy Bridge | 1 | 0.5
Westmere | 1 | 1
Nehalem | 1 | 1
vpminsb
__m256i _mm256_mask_min_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_min_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpminsb
__m256i _mm256_maskz_min_epi8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_min_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpminsb
__m256i _mm256_min_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_min_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed 8-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 31 i := j*8 IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vpminsb
__m512i _mm512_mask_min_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_min_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpminsb
__m512i _mm512_maskz_min_epi8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_min_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpminsb
__m512i _mm512_min_epi8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_min_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsb
CPUID Flags: AVX512BW

Description

Compare packed 8-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 63 i := j*8 IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpminuw
__m128i _mm_mask_min_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_min_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpminuw
__m128i _mm_maskz_min_epu16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_min_epu16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
pminuw
__m128i _mm_min_epu16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_min_epu16 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pminuw xmm, xmm
CPUID Flags: SSE4.1

Description

Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 7 i := j*16 IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
Ivy Bridge | 1 | 0.5
Sandy Bridge | 1 | 0.5
Westmere | 1 | 1
Nehalem | 1 | 1
vpminuw
__m256i _mm256_mask_min_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_min_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpminuw
__m256i _mm256_maskz_min_epu16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_min_epu16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminuw
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpminuw
__m256i _mm256_min_epu16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_min_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminuw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 15 i := j*16 IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vpminuw
__m512i _mm512_mask_min_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_min_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpminuw
__m512i _mm512_maskz_min_epu16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_min_epu16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpminuw
__m512i _mm512_min_epu16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_min_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminuw
CPUID Flags: AVX512BW

Description

Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 31 i := j*16 IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpminud
__m128i _mm_mask_min_epu32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_min_epu32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpminud
__m128i _mm_maskz_min_epu32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_min_epu32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
pminud
__m128i _mm_min_epu32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_min_epu32 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pminud xmm, xmm
CPUID Flags: SSE4.1

Description

Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 3 i := j*32 IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
Ivy Bridge | 1 | 0.5
Sandy Bridge | 1 | 0.5
Westmere | 1 | 1
Nehalem | 1 | 1
vpminud
__m256i _mm256_mask_min_epu32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_min_epu32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpminud
__m256i _mm256_maskz_min_epu32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_min_epu32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminud
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpminud
__m256i _mm256_min_epu32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_min_epu32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminud ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 7 i := j*32 IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vpminud
__m512i _mm512_mask_min_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_min_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminud zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpminud
__m512i _mm512_maskz_min_epu32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_min_epu32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminud zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpminud
__m512i _mm512_min_epu32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_min_epu32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminud zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 15 i := j*32 IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpminuq
__m128i _mm_mask_min_epu64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_min_epu64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpminuq
__m128i _mm_maskz_min_epu64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_min_epu64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpminuq
__m128i _mm_min_epu64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_min_epu64 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 1 i := j*64 IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpminuq
__m256i _mm256_mask_min_epu64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_min_epu64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpminuq
__m256i _mm256_maskz_min_epu64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_min_epu64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpminuq
__m256i _mm256_min_epu64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_min_epu64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminuq
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 3 i := j*64 IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpminuq
__m512i _mm512_mask_min_epu64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_min_epu64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminuq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpminuq
__m512i _mm512_maskz_min_epu64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_min_epu64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminuq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpminuq
__m512i _mm512_min_epu64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_min_epu64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminuq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 7 i := j*64 IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpminub
__m128i _mm_mask_min_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_min_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpminub
__m128i _mm_maskz_min_epu8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_min_epu8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
pminub
__m128i _mm_min_epu8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_min_epu8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pminub xmm, xmm
CPUID Flags: SSE2

Description

Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 15 i := j*8 IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
Ivy Bridge | 1 | -
Sandy Bridge | 1 | -
Westmere | 1 | -
Nehalem | 1 | -
vpminub
__m256i _mm256_mask_min_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_min_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpminub
__m256i _mm256_maskz_min_epu8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_min_epu8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminub
CPUID Flags: AVX512VL + AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpminub
__m256i _mm256_min_epu8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_min_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminub ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 31 i := j*8 IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vpminub
__m512i _mm512_mask_min_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_min_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpminub
__m512i _mm512_maskz_min_epu8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_min_epu8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpminub
__m512i _mm512_min_epu8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_min_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminub
CPUID Flags: AVX512BW

Description

Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 63 i := j*8 IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR dst[MAX:512] := 0
vminpd
__m128d _mm_mask_min_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_min_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vminpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vminpd
__m128d _mm_maskz_min_pd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_min_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vminpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
minpd
__m128d _mm_min_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_min_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: minpd xmm, xmm
CPUID Flags: SSE2

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 3 | 0.8
Ivy Bridge | 3 | 1
Sandy Bridge | 3 | 1
Westmere | 3 | 1
Nehalem | 3 | 1
vminpd
__m256d _mm256_mask_min_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_mask_min_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vminpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vminpd
__m256d _mm256_maskz_min_pd (__mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_maskz_min_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vminpd
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vminpd
__m256d _mm256_min_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_min_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vminpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
vminpd
__m512d _mm512_mask_min_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_min_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vminpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vminpd
__m512d _mm512_maskz_min_pd (__mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_maskz_min_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vminpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vminpd
__m512d _mm512_min_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_min_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vminpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
pminsw
__m64 _mm_min_pi16 (__m64 a, __m64 b)

Synopsis

__m64 _mm_min_pi16 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pminsw mm, mm
CPUID Flags: SSE

Description

Compare packed 16-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 3 i := j*16 IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell1-
vminps
__m128 _mm_mask_min_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_min_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vminps
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vminps
__m128 _mm_maskz_min_ps (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_min_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vminps
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
minps
__m128 _mm_min_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_min_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: minps xmm, xmm
CPUID Flags: SSE

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vminps
__m256 _mm256_mask_min_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_mask_min_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vminps
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vminps
__m256 _mm256_maskz_min_ps (__mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_maskz_min_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vminps
CPUID Flags: AVX512VL + AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vminps
__m256 _mm256_min_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_min_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vminps ymm, ymm, ymm
CPUID Flags: AVX

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
vminps
__m512 _mm512_mask_min_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_min_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vminps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vminps
__m512 _mm512_maskz_min_ps (__mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_maskz_min_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vminps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vminps
__m512 _mm512_min_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_min_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vminps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
pminub
__m64 _mm_min_pu8 (__m64 a, __m64 b)

Synopsis

__m64 _mm_min_pu8 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pminub mm, mm
CPUID Flags: SSE

Description

Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 7 i := j*8 IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell1-
vminpd
__m512d _mm512_mask_min_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int sae)

Synopsis

__m512d _mm512_mask_min_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int sae)
#include "immintrin.h"
Instruction: vminpd zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vminpd
__m512d _mm512_maskz_min_round_pd (__mmask8 k, __m512d a, __m512d b, int sae)

Synopsis

__m512d _mm512_maskz_min_round_pd (__mmask8 k, __m512d a, __m512d b, int sae)
#include "immintrin.h"
Instruction: vminpd zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vminpd
__m512d _mm512_min_round_pd (__m512d a, __m512d b, int sae)

Synopsis

__m512d _mm512_min_round_pd (__m512d a, __m512d b, int sae)
#include "immintrin.h"
Instruction: vminpd zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F

Description

Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
vminps
__m512 _mm512_mask_min_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int sae)

Synopsis

__m512 _mm512_mask_min_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int sae)
#include "immintrin.h"
Instruction: vminps zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vminps
__m512 _mm512_maskz_min_round_ps (__mmask16 k, __m512 a, __m512 b, int sae)

Synopsis

__m512 _mm512_maskz_min_round_ps (__mmask16 k, __m512 a, __m512 b, int sae)
#include "immintrin.h"
Instruction: vminps zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vminps
__m512 _mm512_min_round_ps (__m512 a, __m512 b, int sae)

Synopsis

__m512 _mm512_min_round_ps (__m512 a, __m512 b, int sae)
#include "immintrin.h"
Instruction: vminps zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F

Description

Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
vminsd
__m128d _mm_mask_min_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int sae)

Synopsis

__m128d _mm_mask_min_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int sae)
#include "immintrin.h"
Instruction: vminsd xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

IF k[0] dst[63:0] := MIN(a[63:0], b[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vminsd
__m128d _mm_maskz_min_round_sd (__mmask8 k, __m128d a, __m128d b, int sae)

Synopsis

__m128d _mm_maskz_min_round_sd (__mmask8 k, __m128d a, __m128d b, int sae)
#include "immintrin.h"
Instruction: vminsd xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

IF k[0] dst[63:0] := MIN(a[63:0], b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vminsd
__m128d _mm_min_round_sd (__m128d a, __m128d b, int sae)

Synopsis

__m128d _mm_min_round_sd (__m128d a, __m128d b, int sae)
#include "immintrin.h"
Instruction: vminsd xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper element from a to the upper element of dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

dst[63:0] := MIN(a[63:0], b[63:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0
vminss
__m128 _mm_mask_min_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int sae)

Synopsis

__m128 _mm_mask_min_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int sae)
#include "immintrin.h"
Instruction: vminss xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

IF k[0] dst[31:0] := MIN(a[31:0], b[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vminss
__m128 _mm_maskz_min_round_ss (__mmask8 k, __m128 a, __m128 b, int sae)

Synopsis

__m128 _mm_maskz_min_round_ss (__mmask8 k, __m128 a, __m128 b, int sae)
#include "immintrin.h"
Instruction: vminss xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

IF k[0] dst[31:0] := MIN(a[31:0], b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vminss
__m128 _mm_min_round_ss (__m128 a, __m128 b, int sae)

Synopsis

__m128 _mm_min_round_ss (__m128 a, __m128 b, int sae)
#include "immintrin.h"
Instruction: vminss xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper element from a to the upper element of dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.

Operation

dst[31:0] := MIN(a[31:0], b[31:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
vminsd
__m128d _mm_mask_min_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_min_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vminsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := MIN(a[63:0], b[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vminsd
__m128d _mm_maskz_min_sd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_min_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vminsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := MIN(a[63:0], b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
minsd
__m128d _mm_min_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_min_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: minsd xmm, xmm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := MIN(a[63:0], b[63:0]) dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell30.8
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vminss
__m128 _mm_mask_min_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_min_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vminss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[31:0] := MIN(a[31:0], b[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vminss
__m128 _mm_maskz_min_ss (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_min_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vminss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[31:0] := MIN(a[31:0], b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
minss
__m128 _mm_min_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_min_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: minss xmm, xmm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[31:0] := MIN(a[31:0], b[31:0]) dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
phminposuw
__m128i _mm_minpos_epu16 (__m128i a)

Synopsis

__m128i _mm_minpos_epu16 (__m128i a)
#include "smmintrin.h"
Instruction: phminposuw xmm, xmm
CPUID Flags: SSE4.1

Description

Horizontally compute the minimum amongst the packed unsigned 16-bit integers in a, store the minimum and index in dst, and zero the remaining bits in dst.

Operation

index[2:0] := 0 min[15:0] := a[15:0] FOR j := 0 to 7 i := j*16 IF a[i+15:i] < min[15:0] index[2:0] := j min[15:0] := a[i+15:i] FI ENDFOR dst[15:0] := min[15:0] dst[18:16] := index[2:0] dst[127:19] := 0

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge51
Sandy Bridge51
Westmere31
Nehalem31
monitor
void _mm_monitor (void const* p, unsigned extensions, unsigned hints)

Synopsis

void _mm_monitor (void const* p, unsigned extensions, unsigned hints)
#include "pmmintrin.h"
Instruction: monitor
CPUID Flags: MONITOR

Description

Arm address monitoring hardware using the address specified in p. A store to an address within the specified address range triggers the monitoring hardware. Specify optional extensions in extensions, and optional hints in hints.
vmovdqu16
__m128i _mm_mask_mov_epi16 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_mov_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW

Description

Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vmovdqu16
__m128i _mm_maskz_mov_epi16 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_mov_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW

Description

Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovdqu16
__m256i _mm256_mask_mov_epi16 (__m256i src, __mmask16 k, __m256i a)

Synopsis

__m256i _mm256_mask_mov_epi16 (__m256i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW

Description

Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vmovdqu16
__m256i _mm256_maskz_mov_epi16 (__mmask16 k, __m256i a)

Synopsis

__m256i _mm256_maskz_mov_epi16 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW

Description

Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovdqu16
__m512i _mm512_mask_mov_epi16 (__m512i src, __mmask32 k, __m512i a)

Synopsis

__m512i _mm512_mask_mov_epi16 (__m512i src, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512BW

Description

Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vmovdqu16
__m512i _mm512_maskz_mov_epi16 (__mmask32 k, __m512i a)

Synopsis

__m512i _mm512_maskz_mov_epi16 (__mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512BW

Description

Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmovdqa32
__m128i _mm_mask_mov_epi32 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_mov_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F

Description

Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vmovdqa32
__m128i _mm_maskz_mov_epi32 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_mov_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F

Description

Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovdqa32
__m256i _mm256_mask_mov_epi32 (__m256i src, __mmask8 k, __m256i a)

Synopsis

__m256i _mm256_mask_mov_epi32 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F

Description

Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vmovdqa32
__m256i _mm256_maskz_mov_epi32 (__mmask8 k, __m256i a)

Synopsis

__m256i _mm256_maskz_mov_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F

Description

Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovdqa32
__m512i _mm512_mask_mov_epi32 (__m512i src, __mmask16 k, __m512i a)

Synopsis

__m512i _mm512_mask_mov_epi32 (__m512i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vmovdqa32
__m512i _mm512_maskz_mov_epi32 (__mmask16 k, __m512i a)

Synopsis

__m512i _mm512_maskz_mov_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, zmm
CPUID Flags: AVX512F

Description

Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmovdqa64
__m128i _mm_mask_mov_epi64 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_mov_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F

Description

Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vmovdqa64
__m128i _mm_maskz_mov_epi64 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_mov_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F

Description

Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovdqa64
__m256i _mm256_mask_mov_epi64 (__m256i src, __mmask8 k, __m256i a)

Synopsis

__m256i _mm256_mask_mov_epi64 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F

Description

Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vmovdqa64
__m256i _mm256_maskz_mov_epi64 (__mmask8 k, __m256i a)

Synopsis

__m256i _mm256_maskz_mov_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F

Description

Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovdqa64
__m512i _mm512_mask_mov_epi64 (__m512i src, __mmask8 k, __m512i a)

Synopsis

__m512i _mm512_mask_mov_epi64 (__m512i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa64 zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vmovdqa64
__m512i _mm512_maskz_mov_epi64 (__mmask8 k, __m512i a)

Synopsis

__m512i _mm512_maskz_mov_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa64 zmm {k}, zmm
CPUID Flags: AVX512F

Description

Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmovdqu8
__m128i _mm_mask_mov_epi8 (__m128i src, __mmask16 k, __m128i a)

Synopsis

__m128i _mm_mask_mov_epi8 (__m128i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW

Description

Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vmovdqu8
__m128i _mm_maskz_mov_epi8 (__mmask16 k, __m128i a)

Synopsis

__m128i _mm_maskz_mov_epi8 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW

Description

Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovdqu8
__m256i _mm256_mask_mov_epi8 (__m256i src, __mmask32 k, __m256i a)

Synopsis

__m256i _mm256_mask_mov_epi8 (__m256i src, __mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW

Description

Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vmovdqu8
__m256i _mm256_maskz_mov_epi8 (__mmask32 k, __m256i a)

Synopsis

__m256i _mm256_maskz_mov_epi8 (__mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW

Description

Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovdqu8
__m512i _mm512_mask_mov_epi8 (__m512i src, __mmask64 k, __m512i a)

Synopsis

__m512i _mm512_mask_mov_epi8 (__m512i src, __mmask64 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512BW

Description

Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vmovdqu8
__m512i _mm512_maskz_mov_epi8 (__mmask64 k, __m512i a)

Synopsis

__m512i _mm512_maskz_mov_epi8 (__mmask64 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512BW

Description

Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmovapd
__m128d _mm_mask_mov_pd (__m128d src, __mmask8 k, __m128d a)

Synopsis

__m128d _mm_mask_mov_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F

Description

Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vmovapd
__m128d _mm_maskz_mov_pd (__mmask8 k, __m128d a)

Synopsis

__m128d _mm_maskz_mov_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F

Description

Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovapd
__m256d _mm256_mask_mov_pd (__m256d src, __mmask8 k, __m256d a)

Synopsis

__m256d _mm256_mask_mov_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F

Description

Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vmovapd
__m256d _mm256_maskz_mov_pd (__mmask8 k, __m256d a)

Synopsis

__m256d _mm256_maskz_mov_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F

Description

Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovapd
__m512d _mm512_mask_mov_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_mov_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vmovapd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vmovapd
__m512d _mm512_maskz_mov_pd (__mmask8 k, __m512d a)

Synopsis

__m512d _mm512_maskz_mov_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vmovapd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmovaps
__m128 _mm_mask_mov_ps (__m128 src, __mmask8 k, __m128 a)

Synopsis

__m128 _mm_mask_mov_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F

Description

Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vmovaps
__m128 _mm_maskz_mov_ps (__mmask8 k, __m128 a)

Synopsis

__m128 _mm_maskz_mov_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F

Description

Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vmovaps
__m256 _mm256_mask_mov_ps (__m256 src, __mmask8 k, __m256 a)

Synopsis

__m256 _mm256_mask_mov_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F

Description

Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vmovaps
__m256 _mm256_maskz_mov_ps (__mmask8 k, __m256 a)

Synopsis

__m256 _mm256_maskz_mov_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F

Description

Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovaps
__m512 _mm512_mask_mov_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_mov_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovaps zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vmovaps
__m512 _mm512_maskz_mov_ps (__mmask16 k, __m512 a)

Synopsis

__m512 _mm512_maskz_mov_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovaps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
movq
__m128i _mm_move_epi64 (__m128i a)

Synopsis

__m128i _mm_move_epi64 (__m128i a)
#include "emmintrin.h"
Instruction: movq xmm, xmm
CPUID Flags: SSE2

Description

Copy the lower 64-bit integer in a to the lower element of dst, and zero the upper element.

Operation

dst[63:0] := a[63:0] dst[127:64] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.33
Ivy Bridge10.33
Sandy Bridge10.33
Westmere10.33
Nehalem10.33
vmovsd
__m128d _mm_mask_move_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_move_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmovsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := b[63:0] ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vmovsd
__m128d _mm_maskz_move_sd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_move_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmovsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
movsd
__m128d _mm_move_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_move_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: movsd xmm, xmm
CPUID Flags: SSE2

Description

Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := b[63:0] dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge1-
Westmere1-
Nehalem1-
vmovss
__m128 _mm_mask_move_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_move_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmovss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := b[31:0] ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vmovss
__m128 _mm_maskz_move_ss (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_move_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmovss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
movss
__m128 _mm_move_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_move_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: movss xmm, xmm
CPUID Flags: SSE

Description

Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst, and copy the upper 3 elements from a to the upper elements of dst.

Operation

dst[31:0] := b[31:0] dst[63:32] := a[63:32] dst[95:64] := a[95:64] dst[127:96] := a[127:96]

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
Westmere10.33
Nehalem10.33
vmovddup
__m128d _mm_mask_movedup_pd (__m128d src, __mmask8 k, __m128d a)

Synopsis

__m128d _mm_mask_movedup_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vmovddup
CPUID Flags: AVX512VL + AVX512F

Description

Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[63:0] := a[63:0] tmp[127:64] := a[63:0] FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vmovddup
__m128d _mm_maskz_movedup_pd (__mmask8 k, __m128d a)

Synopsis

__m128d _mm_maskz_movedup_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vmovddup
CPUID Flags: AVX512VL + AVX512F

Description

Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[63:0] := a[63:0] tmp[127:64] := a[63:0] FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
movddup
__m128d _mm_movedup_pd (__m128d a)

Synopsis

__m128d _mm_movedup_pd (__m128d a)
#include "pmmintrin.h"
Instruction: movddup xmm, xmm
CPUID Flags: SSE3

Description

Duplicate the low double-precision (64-bit) floating-point element from a, and store the results in dst.

Operation

dst[63:0] := a[63:0] dst[127:64] := a[63:0]

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
Westmere11
Nehalem11
vmovddup
__m256d _mm256_mask_movedup_pd (__m256d src, __mmask8 k, __m256d a)

Synopsis

__m256d _mm256_mask_movedup_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vmovddup
CPUID Flags: AVX512VL + AVX512F

Description

Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[63:0] := a[63:0] tmp[127:64] := a[63:0] tmp[191:128] := a[191:128] tmp[255:192] := a[191:128] FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vmovddup
__m256d _mm256_maskz_movedup_pd (__mmask8 k, __m256d a)

Synopsis

__m256d _mm256_maskz_movedup_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vmovddup
CPUID Flags: AVX512VL + AVX512F

Description

Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[63:0] := a[63:0] tmp[127:64] := a[63:0] tmp[191:128] := a[191:128] tmp[255:192] := a[191:128] FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovddup
__m256d _mm256_movedup_pd (__m256d a)

Synopsis

__m256d _mm256_movedup_pd (__m256d a)
#include "immintrin.h"
Instruction: vmovddup ymm, ymm
CPUID Flags: AVX

Description

Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst.

Operation

dst[63:0] := a[63:0] dst[127:64] := a[63:0] dst[191:128] := a[191:128] dst[255:192] := a[191:128] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vmovddup
__m512d _mm512_mask_movedup_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_movedup_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vmovddup zmm {k}, zmm
CPUID Flags: AVX512F

Description

Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[63:0] := a[63:0] tmp[127:64] := a[63:0] tmp[191:128] := a[191:128] tmp[255:192] := a[191:128] tmp[319:256] := a[319:256] tmp[383:320] := a[319:256] tmp[447:384] := a[447:384] tmp[511:448] := a[447:384] FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vmovddup
__m512d _mm512_maskz_movedup_pd (__mmask8 k, __m512d a)

Synopsis

__m512d _mm512_maskz_movedup_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vmovddup zmm {k}, zmm
CPUID Flags: AVX512F

Description

Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[63:0] := a[63:0] tmp[127:64] := a[63:0] tmp[191:128] := a[191:128] tmp[255:192] := a[191:128] tmp[319:256] := a[319:256] tmp[383:320] := a[319:256] tmp[447:384] := a[447:384] tmp[511:448] := a[447:384] FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmovddup
__m512d _mm512_movedup_pd (__m512d a)

Synopsis

__m512d _mm512_movedup_pd (__m512d a)
#include "immintrin.h"
Instruction: vmovddup zmm {k}, zmm
CPUID Flags: AVX512F

Description

Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst.

Operation

dst[63:0] := a[63:0] dst[127:64] := a[63:0] dst[191:128] := a[191:128] dst[255:192] := a[191:128] dst[319:256] := a[319:256] dst[383:320] := a[319:256] dst[447:384] := a[447:384] dst[511:448] := a[447:384] dst[MAX:512] := 0
vmovshdup
__m128 _mm_mask_movehdup_ps (__m128 src, __mmask8 k, __m128 a)

Synopsis

__m128 _mm_mask_movehdup_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovshdup
CPUID Flags: AVX512VL + AVX512F

Description

Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[31:0] := a[63:32] tmp[63:32] := a[63:32] tmp[95:64] := a[127:96] tmp[127:96] := a[127:96] FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vmovshdup
__m128 _mm_maskz_movehdup_ps (__mmask8 k, __m128 a)

Synopsis

__m128 _mm_maskz_movehdup_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovshdup
CPUID Flags: AVX512VL + AVX512F

Description

Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[31:0] := a[63:32] tmp[63:32] := a[63:32] tmp[95:64] := a[127:96] tmp[127:96] := a[127:96] FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
movshdup
__m128 _mm_movehdup_ps (__m128 a)

Synopsis

__m128 _mm_movehdup_ps (__m128 a)
#include "pmmintrin.h"
Instruction: movshdup xmm, xmm
CPUID Flags: SSE3

Description

Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.

Operation

dst[31:0] := a[63:32] dst[63:32] := a[63:32] dst[95:64] := a[127:96] dst[127:96] := a[127:96]

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vmovshdup
__m256 _mm256_mask_movehdup_ps (__m256 src, __mmask8 k, __m256 a)

Synopsis

__m256 _mm256_mask_movehdup_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovshdup
CPUID Flags: AVX512VL + AVX512F

Description

Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[31:0] := a[63:32] tmp[63:32] := a[63:32] tmp[95:64] := a[127:96] tmp[127:96] := a[127:96] tmp[159:128] := a[191:160] tmp[191:160] := a[191:160] tmp[223:192] := a[255:224] tmp[255:224] := a[255:224] FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vmovshdup
__m256 _mm256_maskz_movehdup_ps (__mmask8 k, __m256 a)

Synopsis

__m256 _mm256_maskz_movehdup_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovshdup
CPUID Flags: AVX512VL + AVX512F

Description

Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[31:0] := a[63:32] tmp[63:32] := a[63:32] tmp[95:64] := a[127:96] tmp[127:96] := a[127:96] tmp[159:128] := a[191:160] tmp[191:160] := a[191:160] tmp[223:192] := a[255:224] tmp[255:224] := a[255:224] FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovshdup
__m256 _mm256_movehdup_ps (__m256 a)

Synopsis

__m256 _mm256_movehdup_ps (__m256 a)
#include "immintrin.h"
Instruction: vmovshdup ymm, ymm
CPUID Flags: AVX

Description

Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.

Operation

dst[31:0] := a[63:32] dst[63:32] := a[63:32] dst[95:64] := a[127:96] dst[127:96] := a[127:96] dst[159:128] := a[191:160] dst[191:160] := a[191:160] dst[223:192] := a[255:224] dst[255:224] := a[255:224] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vmovshdup
__m512 _mm512_mask_movehdup_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_movehdup_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovshdup zmm {k}, zmm
CPUID Flags: AVX512F

Description

Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[31:0] := a[63:32] tmp[63:32] := a[63:32] tmp[95:64] := a[127:96] tmp[127:96] := a[127:96] tmp[159:128] := a[191:160] tmp[191:160] := a[191:160] tmp[223:192] := a[255:224] tmp[255:224] := a[255:224] tmp[287:256] := a[319:288] tmp[319:288] := a[319:288] tmp[351:320] := a[383:352] tmp[383:352] := a[383:352] tmp[415:384] := a[447:416] tmp[447:416] := a[447:416] tmp[479:448] := a[511:480] tmp[511:480] := a[511:480] FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vmovshdup
__m512 _mm512_maskz_movehdup_ps (__mmask16 k, __m512 a)

Synopsis

__m512 _mm512_maskz_movehdup_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovshdup zmm {k}, zmm
CPUID Flags: AVX512F

Description

Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[31:0] := a[63:32] tmp[63:32] := a[63:32] tmp[95:64] := a[127:96] tmp[127:96] := a[127:96] tmp[159:128] := a[191:160] tmp[191:160] := a[191:160] tmp[223:192] := a[255:224] tmp[255:224] := a[255:224] tmp[287:256] := a[319:288] tmp[319:288] := a[319:288] tmp[351:320] := a[383:352] tmp[383:352] := a[383:352] tmp[415:384] := a[447:416] tmp[447:416] := a[447:416] tmp[479:448] := a[511:480] tmp[511:480] := a[511:480] FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmovshdup
__m512 _mm512_movehdup_ps (__m512 a)

Synopsis

__m512 _mm512_movehdup_ps (__m512 a)
#include "immintrin.h"
Instruction: vmovshdup zmm {k}, zmm
CPUID Flags: AVX512F

Description

Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.

Operation

dst[31:0] := a[63:32] dst[63:32] := a[63:32] dst[95:64] := a[127:96] dst[127:96] := a[127:96] dst[159:128] := a[191:160] dst[191:160] := a[191:160] dst[223:192] := a[255:224] dst[255:224] := a[255:224] dst[287:256] := a[319:288] dst[319:288] := a[319:288] dst[351:320] := a[383:352] dst[383:352] := a[383:352] dst[415:384] := a[447:416] dst[447:416] := a[447:416] dst[479:448] := a[511:480] dst[511:480] := a[511:480] dst[MAX:512] := 0
movhlps
__m128 _mm_movehl_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_movehl_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: movhlps xmm, xmm
CPUID Flags: SSE

Description

Move the upper 2 single-precision (32-bit) floating-point elements from b to the lower 2 elements of dst, and copy the upper 2 elements from a to the upper 2 elements of dst.

Operation

dst[31:0] := b[95:64] dst[63:32] := b[127:96] dst[95:64] := a[95:64] dst[127:96] := a[127:96]

Performance

ArchitectureLatencyThroughput
Haswell10.33
Ivy Bridge10.33
Sandy Bridge10.33
Westmere10.33
Nehalem10.33
vmovsldup
__m128 _mm_mask_moveldup_ps (__m128 src, __mmask8 k, __m128 a)

Synopsis

__m128 _mm_mask_moveldup_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovsldup
CPUID Flags: AVX512VL + AVX512F

Description

Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[31:0] := a[31:0] tmp[63:32] := a[31:0] tmp[95:64] := a[95:64] tmp[127:96] := a[95:64] FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vmovsldup
__m128 _mm_maskz_moveldup_ps (__mmask8 k, __m128 a)

Synopsis

__m128 _mm_maskz_moveldup_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovsldup
CPUID Flags: AVX512VL + AVX512F

Description

Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[31:0] := a[31:0] tmp[63:32] := a[31:0] tmp[95:64] := a[95:64] tmp[127:96] := a[95:64] FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
movsldup
__m128 _mm_moveldup_ps (__m128 a)

Synopsis

__m128 _mm_moveldup_ps (__m128 a)
#include "pmmintrin.h"
Instruction: movsldup xmm, xmm
CPUID Flags: SSE3

Description

Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.

Operation

dst[31:0] := a[31:0] dst[63:32] := a[31:0] dst[95:64] := a[95:64] dst[127:96] := a[95:64]

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vmovsldup
__m256 _mm256_mask_moveldup_ps (__m256 src, __mmask8 k, __m256 a)

Synopsis

__m256 _mm256_mask_moveldup_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovsldup
CPUID Flags: AVX512VL + AVX512F

Description

Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[31:0] := a[31:0] tmp[63:32] := a[31:0] tmp[95:64] := a[95:64] tmp[127:96] := a[95:64] tmp[159:128] := a[159:128] tmp[191:160] := a[159:128] tmp[223:192] := a[223:192] tmp[255:224] := a[223:192] FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vmovsldup
__m256 _mm256_maskz_moveldup_ps (__mmask8 k, __m256 a)

Synopsis

__m256 _mm256_maskz_moveldup_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovsldup
CPUID Flags: AVX512VL + AVX512F

Description

Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[31:0] := a[31:0] tmp[63:32] := a[31:0] tmp[95:64] := a[95:64] tmp[127:96] := a[95:64] tmp[159:128] := a[159:128] tmp[191:160] := a[159:128] tmp[223:192] := a[223:192] tmp[255:224] := a[223:192] FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmovsldup
__m256 _mm256_moveldup_ps (__m256 a)

Synopsis

__m256 _mm256_moveldup_ps (__m256 a)
#include "immintrin.h"
Instruction: vmovsldup ymm, ymm
CPUID Flags: AVX

Description

Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.

Operation

dst[31:0] := a[31:0] dst[63:32] := a[31:0] dst[95:64] := a[95:64] dst[127:96] := a[95:64] dst[159:128] := a[159:128] dst[191:160] := a[159:128] dst[223:192] := a[223:192] dst[255:224] := a[223:192] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vmovsldup
__m512 _mm512_mask_moveldup_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_moveldup_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovsldup zmm {k}, zmm
CPUID Flags: AVX512F

Description

Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp[31:0] := a[31:0] tmp[63:32] := a[31:0] tmp[95:64] := a[95:64] tmp[127:96] := a[95:64] tmp[159:128] := a[159:128] tmp[191:160] := a[159:128] tmp[223:192] := a[223:192] tmp[255:224] := a[223:192] tmp[287:256] := a[287:256] tmp[319:288] := a[287:256] tmp[351:320] := a[351:320] tmp[383:352] := a[351:320] tmp[415:384] := a[415:384] tmp[447:416] := a[415:384] tmp[479:448] := a[479:448] tmp[511:480] := a[479:448] FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vmovsldup
__m512 _mm512_maskz_moveldup_ps (__mmask16 k, __m512 a)

Synopsis

__m512 _mm512_maskz_moveldup_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovsldup zmm {k}, zmm
CPUID Flags: AVX512F

Description

Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp[31:0] := a[31:0] tmp[63:32] := a[31:0] tmp[95:64] := a[95:64] tmp[127:96] := a[95:64] tmp[159:128] := a[159:128] tmp[191:160] := a[159:128] tmp[223:192] := a[223:192] tmp[255:224] := a[223:192] tmp[287:256] := a[287:256] tmp[319:288] := a[287:256] tmp[351:320] := a[351:320] tmp[383:352] := a[351:320] tmp[415:384] := a[415:384] tmp[447:416] := a[415:384] tmp[479:448] := a[479:448] tmp[511:480] := a[479:448] FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmovsldup
__m512 _mm512_moveldup_ps (__m512 a)

Synopsis

__m512 _mm512_moveldup_ps (__m512 a)
#include "immintrin.h"
Instruction: vmovsldup zmm {k}, zmm
CPUID Flags: AVX512F

Description

Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.

Operation

dst[31:0] := a[31:0] dst[63:32] := a[31:0] dst[95:64] := a[95:64] dst[127:96] := a[95:64] dst[159:128] := a[159:128] dst[191:160] := a[159:128] dst[223:192] := a[223:192] dst[255:224] := a[223:192] dst[287:256] := a[287:256] dst[319:288] := a[287:256] dst[351:320] := a[351:320] dst[383:352] := a[351:320] dst[415:384] := a[415:384] dst[447:416] := a[415:384] dst[479:448] := a[479:448] dst[511:480] := a[479:448] dst[MAX:512] := 0
movlhps
__m128 _mm_movelh_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_movelh_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: movlhps xmm, xmm
CPUID Flags: SSE

Description

Move the lower 2 single-precision (32-bit) floating-point elements from b to the upper 2 elements of dst, and copy the lower 2 elements from a to the lower 2 elements of dst.

Operation

dst[31:0] := a[31:0] dst[63:32] := a[63:32] dst[95:64] := b[31:0] dst[127:96] := b[63:32]

Performance

ArchitectureLatencyThroughput
Haswell10.33
Ivy Bridge10.33
Sandy Bridge10.33
Westmere10.33
Nehalem10.33
pmovmskb
int _mm_movemask_epi8 (__m128i a)

Synopsis

int _mm_movemask_epi8 (__m128i a)
#include "emmintrin.h"
Instruction: pmovmskb r32, xmm
CPUID Flags: SSE2

Description

Create mask from the most significant bit of each 8-bit element in a, and store the result in dst.

Operation

FOR j := 0 to 15 i := j*8 dst[j] := a[i+7] ENDFOR dst[MAX:16] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
vpmovmskb
int _mm256_movemask_epi8 (__m256i a)

Synopsis

int _mm256_movemask_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovmskb r32, ymm
CPUID Flags: AVX2

Description

Create mask from the most significant bit of each 8-bit element in a, and store the result in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[j] := a[i+7] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell3-
movmskpd
int _mm_movemask_pd (__m128d a)

Synopsis

int _mm_movemask_pd (__m128d a)
#include "emmintrin.h"
Instruction: movmskpd r32, xmm
CPUID Flags: SSE2

Description

Set each bit of mask dst based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in a.

Operation

FOR j := 0 to 1 i := j*64 IF a[i+63] dst[j] := 1 ELSE dst[j] := 0 FI ENDFOR dst[MAX:2] := 0
vmovmskpd
int _mm256_movemask_pd (__m256d a)

Synopsis

int _mm256_movemask_pd (__m256d a)
#include "immintrin.h"
Instruction: vmovmskpd r32, ymm
CPUID Flags: AVX

Description

Set each bit of mask dst based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in a.

Operation

FOR j := 0 to 3 i := j*64 IF a[i+63] dst[j] := 1 ELSE dst[j] := 0 FI ENDFOR dst[MAX:4] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge2-
Sandy Bridge2-
pmovmskb
int _mm_movemask_pi8 (__m64 a)

Synopsis

int _mm_movemask_pi8 (__m64 a)
#include "xmmintrin.h"
Instruction: pmovmskb r32, mm
CPUID Flags: SSE

Description

Create mask from the most significant bit of each 8-bit element in a, and store the result in dst.

Operation

FOR j := 0 to 7 i := j*8 dst[j] := a[i+7] ENDFOR dst[MAX:8] := 0
movmskps
int _mm_movemask_ps (__m128 a)

Synopsis

int _mm_movemask_ps (__m128 a)
#include "xmmintrin.h"
Instruction: movmskps r32, xmm
CPUID Flags: SSE

Description

Set each bit of mask dst based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in a.

Operation

FOR j := 0 to 3 i := j*32 IF a[i+31] dst[j] := 1 ELSE dst[j] := 0 FI ENDFOR dst[MAX:4] := 0
vmovmskps
int _mm256_movemask_ps (__m256 a)

Synopsis

int _mm256_movemask_ps (__m256 a)
#include "immintrin.h"
Instruction: vmovmskps r32, ymm
CPUID Flags: AVX

Description

Set each bit of mask dst based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in a.

Operation

FOR j := 0 to 7 i := j*32 IF a[i+31] dst[j] := 1 ELSE dst[j] := 0 FI ENDFOR dst[MAX:8] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge2-
Sandy Bridge2-
vpmovw2m
__mmask8 _mm_movepi16_mask (__m128i a)

Synopsis

__mmask8 _mm_movepi16_mask (__m128i a)
#include "immintrin.h"
Instruction: vpmovw2m
CPUID Flags: AVX512VL + AVX512BW

Description

Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a.

Operation

FOR j := 0 to 7 i := j*16 IF a[i+15] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpmovw2m
__mmask16 _mm256_movepi16_mask (__m256i a)

Synopsis

__mmask16 _mm256_movepi16_mask (__m256i a)
#include "immintrin.h"
Instruction: vpmovw2m
CPUID Flags: AVX512VL + AVX512BW

Description

Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a.

Operation

FOR j := 0 to 15 i := j*16 IF a[i+15] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpmovw2m
__mmask32 _mm512_movepi16_mask (__m512i a)

Synopsis

__mmask32 _mm512_movepi16_mask (__m512i a)
#include "immintrin.h"
Instruction: vpmovw2m
CPUID Flags: AVX512BW

Description

Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a.

Operation

FOR j := 0 to 31 i := j*16 IF a[i+15] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpmovd2m
__mmask8 _mm_movepi32_mask (__m128i a)

Synopsis

__mmask8 _mm_movepi32_mask (__m128i a)
#include "immintrin.h"
Instruction: vpmovd2m
CPUID Flags: AVX512VL + AVX512DQ

Description

Set each bit of mask register k based on the most significant bit of the corresponding packed 32-bit integer in a.

Operation

FOR j := 0 to 3 i := j*32 IF a[i+31] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpmovd2m
__mmask8 _mm256_movepi32_mask (__m256i a)

Synopsis

__mmask8 _mm256_movepi32_mask (__m256i a)
#include "immintrin.h"
Instruction: vpmovd2m
CPUID Flags: AVX512VL + AVX512DQ

Description

Set each bit of mask register k based on the most significant bit of the corresponding packed 32-bit integer in a.

Operation

FOR j := 0 to 7 i := j*32 IF a[i+31] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vpmovd2m
__mmask16 _mm512_movepi32_mask (__m512i a)

Synopsis

__mmask16 _mm512_movepi32_mask (__m512i a)
#include "immintrin.h"
Instruction: vpmovd2m
CPUID Flags: AVX512DQ

Description

Set each bit of mask register k based on the most significant bit of the corresponding packed 32-bit integer in a.

Operation

FOR j := 0 to 15 i := j*32 IF a[i+31] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpmovq2m
__mmask8 _mm_movepi64_mask (__m128i a)

Synopsis

__mmask8 _mm_movepi64_mask (__m128i a)
#include "immintrin.h"
Instruction: vpmovq2m
CPUID Flags: AVX512VL + AVX512DQ

Description

Set each bit of mask register k based on the most significant bit of the corresponding packed 64-bit integer in a.

Operation

FOR j := 0 to 1 i := j*64 IF a[i+63] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vpmovq2m
__mmask8 _mm256_movepi64_mask (__m256i a)

Synopsis

__mmask8 _mm256_movepi64_mask (__m256i a)
#include "immintrin.h"
Instruction: vpmovq2m
CPUID Flags: AVX512VL + AVX512DQ

Description

Set each bit of mask register k based on the most significant bit of the corresponding packed 64-bit integer in a.

Operation

FOR j := 0 to 3 i := j*64 IF a[i+63] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vpmovq2m
__mmask8 _mm512_movepi64_mask (__m512i a)

Synopsis

__mmask8 _mm512_movepi64_mask (__m512i a)
#include "immintrin.h"
Instruction: vpmovq2m
CPUID Flags: AVX512DQ

Description

Set each bit of mask register k based on the most significant bit of the corresponding packed 64-bit integer in a.

Operation

FOR j := 0 to 7 i := j*64 IF a[i+63] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
movdq2q
__m64 _mm_movepi64_pi64 (__m128i a)

Synopsis

__m64 _mm_movepi64_pi64 (__m128i a)
#include "emmintrin.h"
Instruction: movdq2q mm, xmm
CPUID Flags: SSE2

Description

Copy the lower 64-bit integer in a to dst.

Operation

dst[63:0] := a[63:0]
vpmovb2m
__mmask16 _mm_movepi8_mask (__m128i a)

Synopsis

__mmask16 _mm_movepi8_mask (__m128i a)
#include "immintrin.h"
Instruction: vpmovb2m
CPUID Flags: AVX512VL + AVX512BW

Description

Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a.

Operation

FOR j := 0 to 15 i := j*8 IF a[i+7] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vpmovb2m
__mmask32 _mm256_movepi8_mask (__m256i a)

Synopsis

__mmask32 _mm256_movepi8_mask (__m256i a)
#include "immintrin.h"
Instruction: vpmovb2m
CPUID Flags: AVX512VL + AVX512BW

Description

Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a.

Operation

FOR j := 0 to 31 i := j*8 IF a[i+7] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vpmovb2m
__mmask64 _mm512_movepi8_mask (__m512i a)

Synopsis

__mmask64 _mm512_movepi8_mask (__m512i a)
#include "immintrin.h"
Instruction: vpmovb2m
CPUID Flags: AVX512BW

Description

Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a.

Operation

FOR j := 0 to 63 i := j*8 IF a[i+7] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
vpmovm2w
__m128i _mm_movm_epi16 (__mmask8 k)

Synopsis

__m128i _mm_movm_epi16 (__mmask8 k)
#include "immintrin.h"
Instruction: vpmovm2w
CPUID Flags: AVX512VL + AVX512BW

Description

Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := 0xFFFF ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovm2w
__m256i _mm256_movm_epi16 (__mmask16 k)

Synopsis

__m256i _mm256_movm_epi16 (__mmask16 k)
#include "immintrin.h"
Instruction: vpmovm2w
CPUID Flags: AVX512VL + AVX512BW

Description

Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := 0xFFFF ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovm2w
__m512i _mm512_movm_epi16 (__mmask32 k)

Synopsis

__m512i _mm512_movm_epi16 (__mmask32 k)
#include "immintrin.h"
Instruction: vpmovm2w
CPUID Flags: AVX512BW

Description

Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := 0xFFFF ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmovm2d
__m128i _mm_movm_epi32 (__mmask8 k)

Synopsis

__m128i _mm_movm_epi32 (__mmask8 k)
#include "immintrin.h"
Instruction: vpmovm2d
CPUID Flags: AVX512VL + AVX512DQ

Description

Set each packed 32-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := 0xFFFFFFFF ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovm2d
__m256i _mm256_movm_epi32 (__mmask8 k)

Synopsis

__m256i _mm256_movm_epi32 (__mmask8 k)
#include "immintrin.h"
Instruction: vpmovm2d
CPUID Flags: AVX512VL + AVX512DQ

Description

Set each packed 32-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := 0xFFFFFFFF ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovm2d
__m512i _mm512_movm_epi32 (__mmask16 k)

Synopsis

__m512i _mm512_movm_epi32 (__mmask16 k)
#include "immintrin.h"
Instruction: vpmovm2d
CPUID Flags: AVX512DQ

Description

Set each packed 32-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := 0xFFFFFFFF ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmovm2q
__m128i _mm_movm_epi64 (__mmask8 k)

Synopsis

__m128i _mm_movm_epi64 (__mmask8 k)
#include "immintrin.h"
Instruction: vpmovm2q
CPUID Flags: AVX512VL + AVX512DQ

Description

Set each packed 64-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := 0xFFFFFFFFFFFFFFFF ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovm2q
__m256i _mm256_movm_epi64 (__mmask8 k)

Synopsis

__m256i _mm256_movm_epi64 (__mmask8 k)
#include "immintrin.h"
Instruction: vpmovm2q
CPUID Flags: AVX512VL + AVX512DQ

Description

Set each packed 64-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := 0xFFFFFFFFFFFFFFFF ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovm2q
__m512i _mm512_movm_epi64 (__mmask8 k)

Synopsis

__m512i _mm512_movm_epi64 (__mmask8 k)
#include "immintrin.h"
Instruction: vpmovm2q
CPUID Flags: AVX512DQ

Description

Set each packed 64-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := 0xFFFFFFFFFFFFFFFF ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmovm2b
__m128i _mm_movm_epi8 (__mmask16 k)

Synopsis

__m128i _mm_movm_epi8 (__mmask16 k)
#include "immintrin.h"
Instruction: vpmovm2b
CPUID Flags: AVX512BW + AVX512VL

Description

Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := 0xFF ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmovm2b
__m256i _mm256_movm_epi8 (__mmask32 k)

Synopsis

__m256i _mm256_movm_epi8 (__mmask32 k)
#include "immintrin.h"
Instruction: vpmovm2b
CPUID Flags: AVX512VL + AVX512BW

Description

Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := 0xFF ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmovm2b
__m512i _mm512_movm_epi8 (__mmask64 k)

Synopsis

__m512i _mm512_movm_epi8 (__mmask64 k)
#include "immintrin.h"
Instruction: vpmovm2b
CPUID Flags: AVX512BW

Description

Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := 0xFF ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
movq2dq
__m128i _mm_movpi64_epi64 (__m64 a)

Synopsis

__m128i _mm_movpi64_epi64 (__m64 a)
#include "emmintrin.h"
Instruction: movq2dq xmm, mm
CPUID Flags: SSE2

Description

Copy the 64-bit integer a to the lower element of dst, and zero the upper element.

Operation

dst[63:0] := a[63:0] dst[127:64] := 0
mpsadbw
__m128i _mm_mpsadbw_epu8 (__m128i a, __m128i b, const int imm8)

Synopsis

__m128i _mm_mpsadbw_epu8 (__m128i a, __m128i b, const int imm8)
#include "smmintrin.h"
Instruction: mpsadbw xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Eight SADs are performed using one quadruplet from b and eight quadruplets from a. One quadruplet is selected from b starting at the offset specified in imm8. Eight quadruplets are formed from sequential 8-bit integers selected from a starting at the offset specified in imm8.

Operation

MPSADBW(a[127:0], b[127:0], imm8[2:0]) { a_offset := imm8[2]*32 b_offset := imm8[1:0]*32 FOR j := 0 to 7 i := j*8 k := a_offset+i l := b_offset tmp[i+15:i] := ABS(a[k+7:k] - b[l+7:l]) + ABS(a[k+15:k+8] - b[l+15:l+8]) + ABS(a[k+23:k+16] - b[l+23:l+16]) + ABS(a[k+31:k+24] - b[l+31:l+24]) ENDFOR RETURN tmp[127:0] } dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0])

Performance

ArchitectureLatencyThroughput
Haswell72
Ivy Bridge51
Sandy Bridge51
Westmere51
Nehalem51
vmpsadbw
__m256i _mm256_mpsadbw_epu8 (__m256i a, __m256i b, const int imm8)

Synopsis

__m256i _mm256_mpsadbw_epu8 (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vmpsadbw ymm, ymm, ymm, imm
CPUID Flags: AVX2

Description

Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Eight SADs are performed for each 128-bit lane using one quadruplet from b and eight quadruplets from a. One quadruplet is selected from b starting at the offset specified in imm8. Eight quadruplets are formed from sequential 8-bit integers selected from a starting at the offset specified in imm8.

Operation

MPSADBW(a[127:0], b[127:0], imm8[2:0]) { a_offset := imm8[2]*32 b_offset := imm8[1:0]*32 FOR j := 0 to 7 i := j*8 k := a_offset+i l := b_offset tmp[i+15:i] := ABS(a[k+7:k] - b[l+7:l]) + ABS(a[k+15:k+8] - b[l+15:l+8]) + ABS(a[k+23:k+16] - b[l+23:l+16]) + ABS(a[k+31:k+24] - b[l+31:l+24]) ENDFOR RETURN tmp[127:0] } dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0]) dst[255:128] := MPSADBW(a[255:128], b[255:128], imm8[5:3]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell72
vpmuldq
__m128i _mm_mask_mul_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_mul_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmuldq
CPUID Flags: AVX512VL + AVX512F

Description

Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpmuldq
__m128i _mm_maskz_mul_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_mul_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmuldq
CPUID Flags: AVX512VL + AVX512F

Description

Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
pmuldq
__m128i _mm_mul_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_mul_epi32 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pmuldq xmm, xmm
CPUID Flags: SSE4.1

Description

Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := a[i+31:i] * b[i+31:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vpmuldq
__m256i _mm256_mask_mul_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_mul_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmuldq
CPUID Flags: AVX512VL + AVX512F

Description

Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpmuldq
__m256i _mm256_maskz_mul_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_mul_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmuldq
CPUID Flags: AVX512VL + AVX512F

Description

Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmuldq
__m256i _mm256_mul_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_mul_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmuldq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[i+31:i] * b[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell51
vpmuldq
__m512i _mm512_mask_mul_epi32 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_mul_epi32 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmuldq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpmuldq
__m512i _mm512_maskz_mul_epi32 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_mul_epi32 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmuldq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmuldq
__m512i _mm512_mul_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_mul_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmuldq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+31:i] * b[i+31:i] ENDFOR dst[MAX:512] := 0
vpmuludq
__m128i _mm_mask_mul_epu32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_mul_epu32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmuludq
CPUID Flags: AVX512VL + AVX512F

Description

Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpmuludq
__m128i _mm_maskz_mul_epu32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_mul_epu32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmuludq
CPUID Flags: AVX512VL + AVX512F

Description

Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
pmuludq
__m128i _mm_mul_epu32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_mul_epu32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pmuludq xmm, xmm
CPUID Flags: SSE2

Description

Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := a[i+31:i] * b[i+31:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vpmuludq
__m256i _mm256_mask_mul_epu32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_mul_epu32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmuludq
CPUID Flags: AVX512VL + AVX512F

Description

Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpmuludq
__m256i _mm256_maskz_mul_epu32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_mul_epu32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmuludq
CPUID Flags: AVX512VL + AVX512F

Description

Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmuludq
__m256i _mm256_mul_epu32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_mul_epu32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmuludq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[i+31:i] * b[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell51
vpmuludq
__m512i _mm512_mask_mul_epu32 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_mul_epu32 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmuludq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpmuludq
__m512i _mm512_maskz_mul_epu32 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_mul_epu32 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmuludq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmuludq
__m512i _mm512_mul_epu32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_mul_epu32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmuludq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+31:i] * b[i+31:i] ENDFOR dst[MAX:512] := 0
vmulpd
__m128d _mm_mask_mul_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_mul_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmulpd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vmulpd
__m128d _mm_maskz_mul_pd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_mul_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmulpd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
mulpd
__m128d _mm_mul_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_mul_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: mulpd xmm, xmm
CPUID Flags: SSE2

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := a[i+63:i] * b[i+63:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell50.5
Ivy Bridge51
Sandy Bridge51
Westmere51
Nehalem51
vmulpd
__m256d _mm256_mask_mul_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_mask_mul_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vmulpd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vmulpd
__m256d _mm256_maskz_mul_pd (__mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_maskz_mul_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vmulpd
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmulpd
__m256d _mm256_mul_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_mul_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vmulpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[i+63:i] * b[i+63:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
Ivy Bridge51
Sandy Bridge51
vmulpd
__m512d _mm512_mask_mul_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_mul_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vmulpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). RM.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vmulpd
__m512d _mm512_maskz_mul_pd (__mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_maskz_mul_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vmulpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmulpd
__m512d _mm512_mul_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_mul_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vmulpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+63:i] * b[i+63:i] ENDFOR dst[MAX:512] := 0
vmulps
__m128 _mm_mask_mul_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_mul_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmulps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). RM.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vmulps
__m128 _mm_maskz_mul_ps (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_mul_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmulps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
mulps
__m128 _mm_mul_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_mul_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: mulps xmm, xmm
CPUID Flags: SSE

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := a[i+31:i] * b[i+31:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell50.5
Ivy Bridge51
Sandy Bridge51
Westmere41
Nehalem41
vmulps
__m256 _mm256_mask_mul_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_mask_mul_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vmulps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). RM.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vmulps
__m256 _mm256_maskz_mul_ps (__mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_maskz_mul_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vmulps
CPUID Flags: AVX512VL + AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vmulps
__m256 _mm256_mul_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_mul_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vmulps ymm, ymm, ymm
CPUID Flags: AVX

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := a[i+31:i] * b[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell50.5
Ivy Bridge51
Sandy Bridge51
vmulps
__m512 _mm512_mask_mul_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_mul_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vmulps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). RM.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vmulps
__m512 _mm512_maskz_mul_ps (__mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_maskz_mul_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vmulps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmulps
__m512 _mm512_mul_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_mul_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vmulps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i] * b[i+31:i] ENDFOR dst[MAX:512] := 0
vmulpd
__m512d _mm512_mask_mul_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_mask_mul_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vmulpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vmulpd
__m512d _mm512_maskz_mul_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_maskz_mul_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vmulpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmulpd
__m512d _mm512_mul_round_pd (__m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_mul_round_pd (__m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vmulpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+63:i] * b[i+63:i] ENDFOR dst[MAX:512] := 0
vmulps
__m512 _mm512_mask_mul_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_mask_mul_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vmulps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vmulps
__m512 _mm512_maskz_mul_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_maskz_mul_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vmulps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vmulps
__m512 _mm512_mul_round_ps (__m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_mul_round_ps (__m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vmulps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i] * b[i+31:i] ENDFOR dst[MAX:512] := 0
vmulsd
__m128d _mm_mask_mul_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_mask_mul_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vmulsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := a[63:0] * b[63:0] ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vmulsd
__m128d _mm_maskz_mul_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_maskz_mul_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vmulsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := a[63:0] * b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vmulsd
__m128d _mm_mul_round_sd (__m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_mul_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vmulsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := a[63:0] * b[63:0] dst[127:64] := a[127:64] dst[MAX:128] := 0
vmulss
__m128 _mm_mask_mul_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_mask_mul_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vmulss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := a[31:0] * b[31:0] ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vmulss
__m128 _mm_maskz_mul_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_maskz_mul_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vmulss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := a[31:0] * b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vmulss
__m128 _mm_mul_round_ss (__m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_mul_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vmulss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := a[31:0] * b[31:0] dst[127:32] := a[127:32] dst[MAX:128] := 0
vmulsd
__m128d _mm_mask_mul_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_mul_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmulsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := a[63:0] * b[63:0] ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vmulsd
__m128d _mm_maskz_mul_sd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_mul_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmulsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := a[63:0] * b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
mulsd
__m128d _mm_mul_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_mul_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: mulsd xmm, xmm
CPUID Flags: SSE2

Description

Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := a[63:0] * b[63:0] dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell50.5
Ivy Bridge51
Sandy Bridge51
Westmere51
Nehalem51
vmulss
__m128 _mm_mask_mul_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_mul_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmulss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := a[31:0] * b[31:0] ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vmulss
__m128 _mm_maskz_mul_ss (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_mul_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmulss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := a[31:0] * b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
mulss
__m128 _mm_mul_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_mul_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: mulss xmm, xmm
CPUID Flags: SSE

Description

Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := a[31:0] * b[31:0] dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell50.5
Ivy Bridge51
Sandy Bridge51
Westmere41
Nehalem41
pmuludq
__m64 _mm_mul_su32 (__m64 a, __m64 b)

Synopsis

__m64 _mm_mul_su32 (__m64 a, __m64 b)
#include "emmintrin.h"
Instruction: pmuludq mm, mm
CPUID Flags: SSE2

Description

Multiply the low unsigned 32-bit integers from a and b, and store the unsigned 64-bit result in dst.

Operation

dst[63:0] := a[31:0] * b[31:0]

Performance

ArchitectureLatencyThroughput
Haswell5-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
vpmulhw
__m128i _mm_mask_mulhi_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_mulhi_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulhw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpmulhw
__m128i _mm_maskz_mulhi_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_mulhi_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulhw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
pmulhw
__m128i _mm_mulhi_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_mulhi_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pmulhw xmm, xmm
CPUID Flags: SSE2

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 7 i := j*16 tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vpmulhw
__m256i _mm256_mask_mulhi_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_mulhi_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpmulhw
__m256i _mm256_maskz_mulhi_epi16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_mulhi_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmulhw
__m256i _mm256_mulhi_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_mulhi_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 15 i := j*16 tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell51
vpmulhw
__m512i _mm512_mask_mulhi_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_mulhi_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhw
CPUID Flags: AVX512BW

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpmulhw
__m512i _mm512_maskz_mulhi_epi16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_mulhi_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhw
CPUID Flags: AVX512BW

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmulhw
__m512i _mm512_mulhi_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_mulhi_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhw
CPUID Flags: AVX512BW

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 31 i := j*16 tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ENDFOR dst[MAX:512] := 0
vpmulhd
__m512i _mm512_mask_mulhi_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_mulhi_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhd zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Performs element-by-element multiplication between packed 32-bit integer elements in a and b and stores the high 32 bits of each result into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpmulhd
__m512i _mm512_mulhi_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_mulhi_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhd zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Performs element-by-element multiplication between packed 32-bit integer elements in a and b and stores the high 32 bits of each result into dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32 ENDFOR dst[MAX:512] := 0
vpmulhuw
__m128i _mm_mask_mulhi_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_mulhi_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulhuw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpmulhuw
__m128i _mm_maskz_mulhi_epu16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_mulhi_epu16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulhuw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
pmulhuw
__m128i _mm_mulhi_epu16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_mulhi_epu16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pmulhuw xmm, xmm
CPUID Flags: SSE2

Description

Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 7 i := j*16 tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vpmulhuw
__m256i _mm256_mask_mulhi_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_mulhi_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhuw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpmulhuw
__m256i _mm256_maskz_mulhi_epu16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_mulhi_epu16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhuw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmulhuw
__m256i _mm256_mulhi_epu16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_mulhi_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhuw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 15 i := j*16 tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell5-
vpmulhuw
__m512i _mm512_mask_mulhi_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_mulhi_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhuw
CPUID Flags: AVX512BW

Description

Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpmulhuw
__m512i _mm512_maskz_mulhi_epu16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_mulhi_epu16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhuw
CPUID Flags: AVX512BW

Description

Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmulhuw
__m512i _mm512_mulhi_epu16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_mulhi_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhuw
CPUID Flags: AVX512BW

Description

Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 31 i := j*16 tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ENDFOR dst[MAX:512] := 0
vpmulhud
__m512i _mm512_mask_mulhi_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_mulhi_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhud zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Performs element-by-element multiplication between packed unsigned 32-bit integer elements in a and b and stores the high 32 bits of each result into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpmulhud
__m512i _mm512_mulhi_epu32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_mulhi_epu32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhud zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Performs element-by-element multiplication between packed unsigned 32-bit integer elements in a and b and stores the high 32 bits of each result into dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32 ENDFOR dst[MAX:512] := 0
pmulhuw
__m64 _mm_mulhi_pu16 (__m64 a, __m64 b)

Synopsis

__m64 _mm_mulhi_pu16 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pmulhuw mm, mm
CPUID Flags: SSE

Description

Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 3 i := j*16 tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ENDFOR
vpmulhrsw
__m128i _mm_mask_mulhrs_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_mulhrs_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulhrsw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpmulhrsw
__m128i _mm_maskz_mulhrs_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_mulhrs_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulhrsw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
pmulhrsw
__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: pmulhrsw xmm, xmm
CPUID Flags: SSSE3

Description

Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst.

Operation

FOR j := 0 to 7 i := j*16 tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vpmulhrsw
__m256i _mm256_mask_mulhrs_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_mulhrs_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhrsw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpmulhrsw
__m256i _mm256_maskz_mulhrs_epi16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_mulhrs_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhrsw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmulhrsw
__m256i _mm256_mulhrs_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_mulhrs_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhrsw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst.

Operation

FOR j := 0 to 15 i := j*16 tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell51
vpmulhrsw
__m512i _mm512_mask_mulhrs_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_mulhrs_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhrsw
CPUID Flags: AVX512BW

Description

Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpmulhrsw
__m512i _mm512_maskz_mulhrs_epi16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_mulhrs_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhrsw
CPUID Flags: AVX512BW

Description

Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmulhrsw
__m512i _mm512_mulhrs_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_mulhrs_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhrsw
CPUID Flags: AVX512BW

Description

Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst.

Operation

FOR j := 0 to 31 i := j*16 tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ENDFOR dst[MAX:512] := 0
pmulhrsw
__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b)

Synopsis

__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: pmulhrsw mm, mm
CPUID Flags: SSSE3

Description

Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst.

Operation

FOR j := 0 to 3 i := j*16 tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ENDFOR
vpmullw
__m128i _mm_mask_mullo_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_mullo_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmullw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpmullw
__m128i _mm_maskz_mullo_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_mullo_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmullw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
pmullw
__m128i _mm_mullo_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_mullo_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pmullw xmm, xmm
CPUID Flags: SSE2

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 7 i := j*16 tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vpmullw
__m256i _mm256_mask_mullo_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_mullo_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmullw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpmullw
__m256i _mm256_maskz_mullo_epi16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_mullo_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmullw
CPUID Flags: AVX512VL + AVX512BW

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmullw
__m256i _mm256_mullo_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_mullo_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmullw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 15 i := j*16 tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell51
vpmullw
__m512i _mm512_mask_mullo_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_mullo_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmullw
CPUID Flags: AVX512BW

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpmullw
__m512i _mm512_maskz_mullo_epi16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_mullo_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmullw
CPUID Flags: AVX512BW

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmullw
__m512i _mm512_mullo_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_mullo_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmullw
CPUID Flags: AVX512BW

Description

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 31 i := j*16 tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ENDFOR dst[MAX:512] := 0
vpmulld
__m128i _mm_mask_mullo_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_mullo_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulld
CPUID Flags: AVX512VL + AVX512F

Description

Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] tmp[63:0] := a[i+31:i] * b[i+31:i] dst[i+31:i] := tmp[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpmulld
__m128i _mm_maskz_mullo_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_mullo_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulld
CPUID Flags: AVX512VL + AVX512F

Description

Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] tmp[63:0] := a[i+31:i] * b[i+31:i] dst[i+31:i] := tmp[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
pmulld
__m128i _mm_mullo_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_mullo_epi32 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pmulld xmm, xmm
CPUID Flags: SSE4.1

Description

Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 3 i := j*32 tmp[63:0] := a[i+31:i] * b[i+31:i] dst[i+31:i] := tmp[31:0] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell101
Ivy Bridge31
Sandy Bridge31
Westmere62
Nehalem62
vpmulld
__m256i _mm256_mask_mullo_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_mullo_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulld
CPUID Flags: AVX512VL + AVX512F

Description

Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] tmp[63:0] := a[i+31:i] * b[i+31:i] dst[i+31:i] := tmp[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpmulld
__m256i _mm256_maskz_mullo_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_mullo_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulld
CPUID Flags: AVX512VL + AVX512F

Description

Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] tmp[63:0] := a[i+31:i] * b[i+31:i] dst[i+31:i] := tmp[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmulld
__m256i _mm256_mullo_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_mullo_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulld ymm, ymm, ymm
CPUID Flags: AVX2

Description

Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 7 i := j*32 tmp[63:0] := a[i+31:i] * b[i+31:i] dst[i+31:i] := tmp[31:0] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell101
vpmulld
__m512i _mm512_mask_mullo_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_mullo_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulld zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] tmp[63:0] := a[i+31:i] * b[i+31:i] dst[i+31:i] := tmp[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpmulld
__m512i _mm512_maskz_mullo_epi32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_mullo_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulld
CPUID Flags: AVX512F

Description

Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] tmp[63:0] := a[i+31:i] * b[i+31:i] dst[i+31:i] := tmp[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmulld
__m512i _mm512_mullo_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_mullo_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulld zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 15 i := j*32 tmp[63:0] := a[i+31:i] * b[i+31:i] dst[i+31:i] := tmp[31:0] ENDFOR dst[MAX:512] := 0
vpmullq
__m128i _mm_mask_mullo_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_mullo_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512VL + AVX512DQ

Description

Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] tmp[127:0] := a[i+63:i] * b[i+63:i] dst[i+63:i] := tmp[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpmullq
__m128i _mm_maskz_mullo_epi64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_mullo_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512VL + AVX512DQ

Description

Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] tmp[127:0] := a[i+63:i] * b[i+63:i] dst[i+63:i] := tmp[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpmullq
__m128i _mm_mullo_epi64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_mullo_epi64 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512VL + AVX512DQ

Description

Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 1 i := j*64 tmp[127:0] := a[i+63:i] * b[i+63:i] dst[i+63:i] := tmp[63:0] ENDFOR dst[MAX:128] := 0
vpmullq
__m256i _mm256_mask_mullo_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_mullo_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512VL + AVX512DQ

Description

Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] tmp[127:0] := a[i+63:i] * b[i+63:i] dst[i+63:i] := tmp[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpmullq
__m256i _mm256_maskz_mullo_epi64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_mullo_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512VL + AVX512DQ

Description

Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] tmp[127:0] := a[i+63:i] * b[i+63:i] dst[i+63:i] := tmp[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpmullq
__m256i _mm256_mullo_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_mullo_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512VL + AVX512DQ

Description

Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 3 i := j*64 tmp[127:0] := a[i+63:i] * b[i+63:i] dst[i+63:i] := tmp[63:0] ENDFOR dst[MAX:256] := 0
vpmullq
__m512i _mm512_mask_mullo_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_mullo_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512DQ

Description

Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] tmp[127:0] := a[i+63:i] * b[i+63:i] dst[i+63:i] := tmp[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpmullq
__m512i _mm512_maskz_mullo_epi64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_mullo_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512DQ

Description

Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] tmp[127:0] := a[i+63:i] * b[i+63:i] dst[i+63:i] := tmp[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpmullq
__m512i _mm512_mullo_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_mullo_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512DQ

Description

Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 7 i := j*64 tmp[127:0] := a[i+63:i] * b[i+63:i] dst[i+63:i] := tmp[63:0] ENDFOR dst[MAX:512] := 0
...
__m512i _mm512_mask_mullox_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_mullox_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Multiplies elements in packed 64-bit integer vectors a and b together, storing the lower 64 bits of the result in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m512i _mm512_mullox_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_mullox_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Multiplies elements in packed 64-bit integer vectors a and b together, storing the lower 64 bits of the result in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+63:i] * b[i+63:i] ENDFOR dst[MAX:512] := 0
vpmultishiftqb
__m128i _mm_mask_multishift_epi64_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_multishift_epi64_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI + AVX512VL

Description

For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR i := 0 to 1 q := i * 64 FOR j := 0 to 7 tmp8 := 0 ctrl := a[q+j*8+7:q+j*8] & 63 FOR l := 0 to 7 tmp8[l] := b[q+((ctrl+l) & 63)] ENDFOR IF k[i*8+j] dst[q+j*8+7:q+j*8] := tmp8[7:0] ELSE dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] FI ENDFOR ENDFOR dst[MAX:128] := 0
vpmultishiftqb
__m128i _mm_maskz_multishift_epi64_epi8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_multishift_epi64_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI + AVX512VL

Description

For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR i := 0 to 1 q := i * 64 FOR j := 0 to 7 tmp8 := 0 ctrl := a[q+j*8+7:q+j*8] & 63 FOR l := 0 to 7 tmp8[l] := b[q+((ctrl+l) & 63)] ENDFOR IF k[i*8+j] dst[q+j*8+7:q+j*8] := tmp8[7:0] ELSE dst[q+j*8+7:q+j*8] := 0 FI ENDFOR ENDFOR dst[MAX:128] := 0
vpmultishiftqb
__m128i _mm_multishift_epi64_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_multishift_epi64_epi8 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI + AVX512VL

Description

For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.

Operation

FOR i := 0 to 1 q := i * 64 FOR j := 0 to 7 tmp8 := 0 ctrl := a[q+j*8+7:q+j*8] & 63 FOR l := 0 to 7 tmp8[l] := b[q+((ctrl+l) & 63)] ENDFOR dst[q+j*8+7:q+j*8] := tmp8[7:0] ENDFOR ENDFOR dst[MAX:128] := 0
vpmultishiftqb
__m256i _mm256_mask_multishift_epi64_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_multishift_epi64_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI + AVX512VL

Description

For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR i := 0 to 3 q := i * 64 FOR j := 0 to 7 tmp8 := 0 ctrl := a[q+j*8+7:q+j*8] & 63 FOR l := 0 to 7 tmp8[l] := b[q+((ctrl+l) & 63)] ENDFOR IF k[i*8+j] dst[q+j*8+7:q+j*8] := tmp8[7:0] ELSE dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] FI ENDFOR ENDFOR dst[MAX:256] := 0
vpmultishiftqb
__m256i _mm256_maskz_multishift_epi64_epi8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_multishift_epi64_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI + AVX512VL

Description

For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR i := 0 to 3 q := i * 64 FOR j := 0 to 7 tmp8 := 0 ctrl := a[q+j*8+7:q+j*8] & 63 FOR l := 0 to 7 tmp8[l] := b[q+((ctrl+l) & 63)] ENDFOR IF k[i*8+j] dst[q+j*8+7:q+j*8] := tmp8[7:0] ELSE dst[q+j*8+7:q+j*8] := 0 FI ENDFOR ENDFOR dst[MAX:256] := 0
vpmultishiftqb
__m256i _mm256_multishift_epi64_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_multishift_epi64_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI + AVX512VL

Description

For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.

Operation

FOR i := 0 to 3 q := i * 64 FOR j := 0 to 7 tmp8 := 0 ctrl := a[q+j*8+7:q+j*8] & 63 FOR l := 0 to 7 tmp8[l] := b[q+((ctrl+l) & 63)] ENDFOR dst[q+j*8+7:q+j*8] := tmp8[7:0] ENDFOR ENDFOR dst[MAX:256] := 0
vpmultishiftqb
__m512i _mm512_mask_multishift_epi64_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_multishift_epi64_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI

Description

For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR i := 0 to 7 q := i * 64 FOR j := 0 to 7 tmp8 := 0 ctrl := a[q+j*8+7:q+j*8] & 63 FOR l := 0 to 7 tmp8[l] := b[q+((ctrl+l) & 63)] ENDFOR IF k[i*8+j] dst[q+j*8+7:q+j*8] := tmp8[7:0] ELSE dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] FI ENDFOR ENDFOR dst[MAX:512] := 0
vpmultishiftqb
__m512i _mm512_maskz_multishift_epi64_epi8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_multishift_epi64_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI

Description

For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR i := 0 to 7 q := i * 64 FOR j := 0 to 7 tmp8 := 0 ctrl := a[q+j*8+7:q+j*8] & 63 FOR l := 0 to 7 tmp8[l] := b[q+((ctrl+l) & 63)] ENDFOR IF k[i*8+j] dst[q+j*8+7:q+j*8] := tmp8[7:0] ELSE dst[q+j*8+7:q+j*8] := 0 FI ENDFOR ENDFOR dst[MAX:512] := 0
vpmultishiftqb
__m512i _mm512_multishift_epi64_epi8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_multishift_epi64_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI

Description

For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.

Operation

FOR i := 0 to 7 q := i * 64 FOR j := 0 to 7 tmp8 := 0 ctrl := a[q+j*8+7:q+j*8] & 63 FOR l := 0 to 7 tmp8[l] := b[q+((ctrl+l) & 63)] ENDFOR dst[q+j*8+7:q+j*8] := tmp8[7:0] ENDFOR ENDFOR dst[MAX:512] := 0
mwait
void _mm_mwait (unsigned extensions, unsigned hints)

Synopsis

void _mm_mwait (unsigned extensions, unsigned hints)
#include "pmmintrin.h"
Instruction: mwait
CPUID Flags: MONITOR

Description

Hint to the processor that it can enter an implementation-dependent-optimized state while waiting for an event or store operation to the address range specified by MONITOR.
...
__m512d _mm512_mask_nearbyint_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_nearbyint_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Rounds each packed double-precision (64-bit) floating-point element in a to the nearest integer value and stores the results as packed double-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := NearbyInt(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_nearbyint_pd (__m512d a)

Synopsis

__m512d _mm512_nearbyint_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Rounds each packed double-precision (64-bit) floating-point element in a to the nearest integer value and stores the results as packed double-precision floating-point elements in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := NearbyInt(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_nearbyint_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_nearbyint_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Rounds each packed single-precision (32-bit) floating-point element in a to the nearest integer value and stores the results as packed single-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := NearbyInt(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_nearbyint_ps (__m512 a)

Synopsis

__m512 _mm512_nearbyint_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Rounds each packed single-precision (32-bit) floating-point element in a to the nearest integer value and stores the results as packed single-precision floating-point elements in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := NearbyInt(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vpord
__m128i _mm_mask_or_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_or_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpord
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpord
__m128i _mm_maskz_or_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_or_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpord
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpord
__m256i _mm256_mask_or_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_or_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpord
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpord
__m256i _mm256_maskz_or_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_or_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpord
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpord
__m512i _mm512_mask_or_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_or_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpord zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpord
__m512i _mm512_maskz_or_epi32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_or_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpord zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpord
__m512i _mm512_or_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_or_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpord zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i] OR b[i+31:i] ENDFOR dst[MAX:512] := 0
vporq
__m128i _mm_mask_or_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_or_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vporq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vporq
__m128i _mm_maskz_or_epi64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_or_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vporq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vporq
__m256i _mm256_mask_or_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_or_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vporq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vporq
__m256i _mm256_maskz_or_epi64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_or_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vporq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vporq
__m512i _mm512_mask_or_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_or_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vporq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vporq
__m512i _mm512_maskz_or_epi64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_or_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vporq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vporq
__m512i _mm512_or_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_or_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vporq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+63:i] OR b[i+63:i] ENDFOR dst[MAX:512] := 0
vorpd
__m128d _mm_mask_or_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_or_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vorpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vorpd
__m128d _mm_maskz_or_pd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_or_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vorpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
orpd
__m128d _mm_or_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_or_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: orpd xmm, xmm
CPUID Flags: SSE2

Description

Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.8
Ivy Bridge10.33
Sandy Bridge10.33
Westmere10.33
Nehalem10.33
vorpd
__m256d _mm256_mask_or_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_mask_or_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vorpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vorpd
__m256d _mm256_maskz_or_pd (__mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_maskz_or_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vorpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vorpd
__m256d _mm256_or_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_or_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vorpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vorpd
__m512d _mm512_mask_or_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_or_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vorpd
CPUID Flags: AVX512DQ

Description

Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vorpd
__m512d _mm512_maskz_or_pd (__mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_maskz_or_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vorpd
CPUID Flags: AVX512DQ

Description

Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vorpd
__m512d _mm512_or_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_or_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vorpd
CPUID Flags: AVX512DQ

Description

Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i] ENDFOR dst[MAX:512] := 0
vorps
__m128 _mm_mask_or_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_or_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vorps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vorps
__m128 _mm_maskz_or_ps (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_or_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vorps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
orps
__m128 _mm_or_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_or_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: orps xmm, xmm
CPUID Flags: SSE

Description

Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.33
Ivy Bridge10.33
Sandy Bridge10.33
Westmere10.33
Nehalem10.33
vorps
__m256 _mm256_mask_or_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_mask_or_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vorps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vorps
__m256 _mm256_maskz_or_ps (__mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_maskz_or_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vorps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vorps
__m256 _mm256_or_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_or_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vorps ymm, ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vorps
__m512 _mm512_mask_or_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_or_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vorps
CPUID Flags: AVX512DQ

Description

Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vorps
__m512 _mm512_maskz_or_ps (__mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_maskz_or_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vorps
CPUID Flags: AVX512DQ

Description

Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vorps
__m512 _mm512_or_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_or_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vorps
CPUID Flags: AVX512DQ

Description

Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i] ENDFOR dst[MAX:512] := 0
por
__m128i _mm_or_si128 (__m128i a, __m128i b)

Synopsis

__m128i _mm_or_si128 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: por xmm, xmm
CPUID Flags: SSE2

Description

Compute the bitwise OR of 128 bits (representing integer data) in a and b, and store the result in dst.

Operation

dst[127:0] := (a[127:0] OR b[127:0])

Performance

ArchitectureLatencyThroughput
Haswell10.33
Ivy Bridge10.33
Sandy Bridge10.33
Westmere10.33
Nehalem10.33
vpor
__m256i _mm256_or_si256 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_or_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpor ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compute the bitwise OR of 256 bits (representing integer data) in a and b, and store the result in dst.

Operation

dst[255:0] := (a[255:0] OR b[255:0]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.33
vpord
__m512i _mm512_or_si512 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_or_si512 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpord zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise OR of 512 bits (representing integer data) in a and b, and store the result in dst.

Operation

dst[511:0] := (a[511:0] OR b[511:0]) dst[MAX:512] := 0
vpacksswb
__m128i _mm_mask_packs_epi16 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_packs_epi16 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpacksswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112]) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpacksswb
__m128i _mm_maskz_packs_epi16 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_packs_epi16 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpacksswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112]) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
packsswb
__m128i _mm_packs_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_packs_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: packsswb xmm, xmm
CPUID Flags: SSE2

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst.

Operation

dst[7:0] := Saturate_Int16_To_Int8 (a[15:0]) dst[15:8] := Saturate_Int16_To_Int8 (a[31:16]) dst[23:16] := Saturate_Int16_To_Int8 (a[47:32]) dst[31:24] := Saturate_Int16_To_Int8 (a[63:48]) dst[39:32] := Saturate_Int16_To_Int8 (a[79:64]) dst[47:40] := Saturate_Int16_To_Int8 (a[95:80]) dst[55:48] := Saturate_Int16_To_Int8 (a[111:96]) dst[63:56] := Saturate_Int16_To_Int8 (a[127:112]) dst[71:64] := Saturate_Int16_To_Int8 (b[15:0]) dst[79:72] := Saturate_Int16_To_Int8 (b[31:16]) dst[87:80] := Saturate_Int16_To_Int8 (b[47:32]) dst[95:88] := Saturate_Int16_To_Int8 (b[63:48]) dst[103:96] := Saturate_Int16_To_Int8 (b[79:64]) dst[111:104] := Saturate_Int16_To_Int8 (b[95:80]) dst[119:112] := Saturate_Int16_To_Int8 (b[111:96]) dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpacksswb
__m256i _mm256_mask_packs_epi16 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_packs_epi16 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpacksswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112]) tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128]) tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144]) tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160]) tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176]) tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192]) tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208]) tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224]) tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240]) tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128]) tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144]) tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160]) tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176]) tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192]) tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208]) tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224]) tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240]) FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpacksswb
__m256i _mm256_maskz_packs_epi16 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_packs_epi16 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpacksswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112]) tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128]) tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144]) tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160]) tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176]) tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192]) tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208]) tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224]) tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240]) tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128]) tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144]) tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160]) tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176]) tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192]) tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208]) tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224]) tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240]) FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpacksswb
__m256i _mm256_packs_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_packs_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpacksswb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst.

Operation

dst[7:0] := Saturate_Int16_To_Int8 (a[15:0]) dst[15:8] := Saturate_Int16_To_Int8 (a[31:16]) dst[23:16] := Saturate_Int16_To_Int8 (a[47:32]) dst[31:24] := Saturate_Int16_To_Int8 (a[63:48]) dst[39:32] := Saturate_Int16_To_Int8 (a[79:64]) dst[47:40] := Saturate_Int16_To_Int8 (a[95:80]) dst[55:48] := Saturate_Int16_To_Int8 (a[111:96]) dst[63:56] := Saturate_Int16_To_Int8 (a[127:112]) dst[71:64] := Saturate_Int16_To_Int8 (b[15:0]) dst[79:72] := Saturate_Int16_To_Int8 (b[31:16]) dst[87:80] := Saturate_Int16_To_Int8 (b[47:32]) dst[95:88] := Saturate_Int16_To_Int8 (b[63:48]) dst[103:96] := Saturate_Int16_To_Int8 (b[79:64]) dst[111:104] := Saturate_Int16_To_Int8 (b[95:80]) dst[119:112] := Saturate_Int16_To_Int8 (b[111:96]) dst[127:120] := Saturate_Int16_To_Int8 (b[127:112]) dst[135:128] := Saturate_Int16_To_Int8 (a[143:128]) dst[143:136] := Saturate_Int16_To_Int8 (a[159:144]) dst[151:144] := Saturate_Int16_To_Int8 (a[175:160]) dst[159:152] := Saturate_Int16_To_Int8 (a[191:176]) dst[167:160] := Saturate_Int16_To_Int8 (a[207:192]) dst[175:168] := Saturate_Int16_To_Int8 (a[223:208]) dst[183:176] := Saturate_Int16_To_Int8 (a[239:224]) dst[191:184] := Saturate_Int16_To_Int8 (a[255:240]) dst[199:192] := Saturate_Int16_To_Int8 (b[143:128]) dst[207:200] := Saturate_Int16_To_Int8 (b[159:144]) dst[215:208] := Saturate_Int16_To_Int8 (b[175:160]) dst[223:216] := Saturate_Int16_To_Int8 (b[191:176]) dst[231:224] := Saturate_Int16_To_Int8 (b[207:192]) dst[239:232] := Saturate_Int16_To_Int8 (b[223:208]) dst[247:240] := Saturate_Int16_To_Int8 (b[239:224]) dst[255:248] := Saturate_Int16_To_Int8 (b[255:240]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
vpacksswb
__m512i _mm512_mask_packs_epi16 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_packs_epi16 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpacksswb
CPUID Flags: AVX512BW

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112]) tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128]) tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144]) tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160]) tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176]) tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192]) tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208]) tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224]) tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240]) tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128]) tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144]) tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160]) tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176]) tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192]) tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208]) tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224]) tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240]) tmp_dst[263:256] := Saturate_Int16_To_Int8 (a[271:256]) tmp_dst[271:264] := Saturate_Int16_To_Int8 (a[287:272]) tmp_dst[279:272] := Saturate_Int16_To_Int8 (a[303:288]) tmp_dst[287:280] := Saturate_Int16_To_Int8 (a[319:304]) tmp_dst[295:288] := 
Saturate_Int16_To_Int8 (a[335:320]) tmp_dst[303:296] := Saturate_Int16_To_Int8 (a[351:336]) tmp_dst[311:304] := Saturate_Int16_To_Int8 (a[367:352]) tmp_dst[319:312] := Saturate_Int16_To_Int8 (a[383:368]) tmp_dst[327:320] := Saturate_Int16_To_Int8 (b[271:256]) tmp_dst[335:328] := Saturate_Int16_To_Int8 (b[287:272]) tmp_dst[343:336] := Saturate_Int16_To_Int8 (b[303:288]) tmp_dst[351:344] := Saturate_Int16_To_Int8 (b[319:304]) tmp_dst[359:352] := Saturate_Int16_To_Int8 (b[335:320]) tmp_dst[367:360] := Saturate_Int16_To_Int8 (b[351:336]) tmp_dst[375:368] := Saturate_Int16_To_Int8 (b[367:352]) tmp_dst[383:376] := Saturate_Int16_To_Int8 (b[383:368]) tmp_dst[391:384] := Saturate_Int16_To_Int8 (a[399:384]) tmp_dst[399:392] := Saturate_Int16_To_Int8 (a[415:400]) tmp_dst[407:400] := Saturate_Int16_To_Int8 (a[431:416]) tmp_dst[415:408] := Saturate_Int16_To_Int8 (a[447:432]) tmp_dst[423:416] := Saturate_Int16_To_Int8 (a[463:448]) tmp_dst[431:424] := Saturate_Int16_To_Int8 (a[479:464]) tmp_dst[439:432] := Saturate_Int16_To_Int8 (a[495:480]) tmp_dst[447:440] := Saturate_Int16_To_Int8 (a[511:496]) tmp_dst[455:448] := Saturate_Int16_To_Int8 (b[399:384]) tmp_dst[463:456] := Saturate_Int16_To_Int8 (b[415:400]) tmp_dst[471:464] := Saturate_Int16_To_Int8 (b[431:416]) tmp_dst[479:472] := Saturate_Int16_To_Int8 (b[447:432]) tmp_dst[487:480] := Saturate_Int16_To_Int8 (b[463:448]) tmp_dst[495:488] := Saturate_Int16_To_Int8 (b[479:464]) tmp_dst[503:496] := Saturate_Int16_To_Int8 (b[495:480]) tmp_dst[511:504] := Saturate_Int16_To_Int8 (b[511:496]) FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpacksswb
__m512i _mm512_maskz_packs_epi16 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_packs_epi16 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpacksswb
CPUID Flags: AVX512BW

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112]) tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128]) tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144]) tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160]) tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176]) tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192]) tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208]) tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224]) tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240]) tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128]) tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144]) tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160]) tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176]) tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192]) tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208]) tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224]) tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240]) tmp_dst[263:256] := Saturate_Int16_To_Int8 (a[271:256]) tmp_dst[271:264] := Saturate_Int16_To_Int8 (a[287:272]) tmp_dst[279:272] := Saturate_Int16_To_Int8 (a[303:288]) tmp_dst[287:280] := Saturate_Int16_To_Int8 (a[319:304]) tmp_dst[295:288] := 
Saturate_Int16_To_Int8 (a[335:320]) tmp_dst[303:296] := Saturate_Int16_To_Int8 (a[351:336]) tmp_dst[311:304] := Saturate_Int16_To_Int8 (a[367:352]) tmp_dst[319:312] := Saturate_Int16_To_Int8 (a[383:368]) tmp_dst[327:320] := Saturate_Int16_To_Int8 (b[271:256]) tmp_dst[335:328] := Saturate_Int16_To_Int8 (b[287:272]) tmp_dst[343:336] := Saturate_Int16_To_Int8 (b[303:288]) tmp_dst[351:344] := Saturate_Int16_To_Int8 (b[319:304]) tmp_dst[359:352] := Saturate_Int16_To_Int8 (b[335:320]) tmp_dst[367:360] := Saturate_Int16_To_Int8 (b[351:336]) tmp_dst[375:368] := Saturate_Int16_To_Int8 (b[367:352]) tmp_dst[383:376] := Saturate_Int16_To_Int8 (b[383:368]) tmp_dst[391:384] := Saturate_Int16_To_Int8 (a[399:384]) tmp_dst[399:392] := Saturate_Int16_To_Int8 (a[415:400]) tmp_dst[407:400] := Saturate_Int16_To_Int8 (a[431:416]) tmp_dst[415:408] := Saturate_Int16_To_Int8 (a[447:432]) tmp_dst[423:416] := Saturate_Int16_To_Int8 (a[463:448]) tmp_dst[431:424] := Saturate_Int16_To_Int8 (a[479:464]) tmp_dst[439:432] := Saturate_Int16_To_Int8 (a[495:480]) tmp_dst[447:440] := Saturate_Int16_To_Int8 (a[511:496]) tmp_dst[455:448] := Saturate_Int16_To_Int8 (b[399:384]) tmp_dst[463:456] := Saturate_Int16_To_Int8 (b[415:400]) tmp_dst[471:464] := Saturate_Int16_To_Int8 (b[431:416]) tmp_dst[479:472] := Saturate_Int16_To_Int8 (b[447:432]) tmp_dst[487:480] := Saturate_Int16_To_Int8 (b[463:448]) tmp_dst[495:488] := Saturate_Int16_To_Int8 (b[479:464]) tmp_dst[503:496] := Saturate_Int16_To_Int8 (b[495:480]) tmp_dst[511:504] := Saturate_Int16_To_Int8 (b[511:496]) FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpacksswb
__m512i _mm512_packs_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_packs_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpacksswb
CPUID Flags: AVX512BW

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst.

Operation

dst[7:0] := Saturate_Int16_To_Int8 (a[15:0]) dst[15:8] := Saturate_Int16_To_Int8 (a[31:16]) dst[23:16] := Saturate_Int16_To_Int8 (a[47:32]) dst[31:24] := Saturate_Int16_To_Int8 (a[63:48]) dst[39:32] := Saturate_Int16_To_Int8 (a[79:64]) dst[47:40] := Saturate_Int16_To_Int8 (a[95:80]) dst[55:48] := Saturate_Int16_To_Int8 (a[111:96]) dst[63:56] := Saturate_Int16_To_Int8 (a[127:112]) dst[71:64] := Saturate_Int16_To_Int8 (b[15:0]) dst[79:72] := Saturate_Int16_To_Int8 (b[31:16]) dst[87:80] := Saturate_Int16_To_Int8 (b[47:32]) dst[95:88] := Saturate_Int16_To_Int8 (b[63:48]) dst[103:96] := Saturate_Int16_To_Int8 (b[79:64]) dst[111:104] := Saturate_Int16_To_Int8 (b[95:80]) dst[119:112] := Saturate_Int16_To_Int8 (b[111:96]) dst[127:120] := Saturate_Int16_To_Int8 (b[127:112]) dst[135:128] := Saturate_Int16_To_Int8 (a[143:128]) dst[143:136] := Saturate_Int16_To_Int8 (a[159:144]) dst[151:144] := Saturate_Int16_To_Int8 (a[175:160]) dst[159:152] := Saturate_Int16_To_Int8 (a[191:176]) dst[167:160] := Saturate_Int16_To_Int8 (a[207:192]) dst[175:168] := Saturate_Int16_To_Int8 (a[223:208]) dst[183:176] := Saturate_Int16_To_Int8 (a[239:224]) dst[191:184] := Saturate_Int16_To_Int8 (a[255:240]) dst[199:192] := Saturate_Int16_To_Int8 (b[143:128]) dst[207:200] := Saturate_Int16_To_Int8 (b[159:144]) dst[215:208] := Saturate_Int16_To_Int8 (b[175:160]) dst[223:216] := Saturate_Int16_To_Int8 (b[191:176]) dst[231:224] := Saturate_Int16_To_Int8 (b[207:192]) dst[239:232] := Saturate_Int16_To_Int8 (b[223:208]) dst[247:240] := Saturate_Int16_To_Int8 (b[239:224]) dst[255:248] := Saturate_Int16_To_Int8 (b[255:240]) dst[263:256] := Saturate_Int16_To_Int8 (a[271:256]) dst[271:264] := Saturate_Int16_To_Int8 (a[287:272]) dst[279:272] := Saturate_Int16_To_Int8 (a[303:288]) dst[287:280] := Saturate_Int16_To_Int8 (a[319:304]) dst[295:288] := Saturate_Int16_To_Int8 (a[335:320]) dst[303:296] := Saturate_Int16_To_Int8 (a[351:336]) dst[311:304] := Saturate_Int16_To_Int8 (a[367:352]) dst[319:312] := 
Saturate_Int16_To_Int8 (a[383:368]) dst[327:320] := Saturate_Int16_To_Int8 (b[271:256]) dst[335:328] := Saturate_Int16_To_Int8 (b[287:272]) dst[343:336] := Saturate_Int16_To_Int8 (b[303:288]) dst[351:344] := Saturate_Int16_To_Int8 (b[319:304]) dst[359:352] := Saturate_Int16_To_Int8 (b[335:320]) dst[367:360] := Saturate_Int16_To_Int8 (b[351:336]) dst[375:368] := Saturate_Int16_To_Int8 (b[367:352]) dst[383:376] := Saturate_Int16_To_Int8 (b[383:368]) dst[391:384] := Saturate_Int16_To_Int8 (a[399:384]) dst[399:392] := Saturate_Int16_To_Int8 (a[415:400]) dst[407:400] := Saturate_Int16_To_Int8 (a[431:416]) dst[415:408] := Saturate_Int16_To_Int8 (a[447:432]) dst[423:416] := Saturate_Int16_To_Int8 (a[463:448]) dst[431:424] := Saturate_Int16_To_Int8 (a[479:464]) dst[439:432] := Saturate_Int16_To_Int8 (a[495:480]) dst[447:440] := Saturate_Int16_To_Int8 (a[511:496]) dst[455:448] := Saturate_Int16_To_Int8 (b[399:384]) dst[463:456] := Saturate_Int16_To_Int8 (b[415:400]) dst[471:464] := Saturate_Int16_To_Int8 (b[431:416]) dst[479:472] := Saturate_Int16_To_Int8 (b[447:432]) dst[487:480] := Saturate_Int16_To_Int8 (b[463:448]) dst[495:488] := Saturate_Int16_To_Int8 (b[479:464]) dst[503:496] := Saturate_Int16_To_Int8 (b[495:480]) dst[511:504] := Saturate_Int16_To_Int8 (b[511:496]) dst[MAX:512] := 0
vpackssdw
__m128i _mm_mask_packs_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_packs_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpackssdw
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96]) FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpackssdw
__m128i _mm_maskz_packs_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_packs_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpackssdw
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96]) FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
packssdw
__m128i _mm_packs_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_packs_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: packssdw xmm, xmm
CPUID Flags: SSE2

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst.

Operation

dst[15:0] := Saturate_Int32_To_Int16 (a[31:0]) dst[31:16] := Saturate_Int32_To_Int16 (a[63:32]) dst[47:32] := Saturate_Int32_To_Int16 (a[95:64]) dst[63:48] := Saturate_Int32_To_Int16 (a[127:96]) dst[79:64] := Saturate_Int32_To_Int16 (b[31:0]) dst[95:80] := Saturate_Int32_To_Int16 (b[63:32]) dst[111:96] := Saturate_Int32_To_Int16 (b[95:64]) dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpackssdw
__m256i _mm256_mask_packs_epi32 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_packs_epi32 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackssdw
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96]) tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128]) tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160]) tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192]) tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224]) tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128]) tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160]) tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192]) tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224]) FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpackssdw
__m256i _mm256_maskz_packs_epi32 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_packs_epi32 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackssdw
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96]) tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128]) tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160]) tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192]) tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224]) tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128]) tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160]) tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192]) tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224]) FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpackssdw
__m256i _mm256_packs_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_packs_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackssdw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst.

Operation

dst[15:0] := Saturate_Int32_To_Int16 (a[31:0]) dst[31:16] := Saturate_Int32_To_Int16 (a[63:32]) dst[47:32] := Saturate_Int32_To_Int16 (a[95:64]) dst[63:48] := Saturate_Int32_To_Int16 (a[127:96]) dst[79:64] := Saturate_Int32_To_Int16 (b[31:0]) dst[95:80] := Saturate_Int32_To_Int16 (b[63:32]) dst[111:96] := Saturate_Int32_To_Int16 (b[95:64]) dst[127:112] := Saturate_Int32_To_Int16 (b[127:96]) dst[143:128] := Saturate_Int32_To_Int16 (a[159:128]) dst[159:144] := Saturate_Int32_To_Int16 (a[191:160]) dst[175:160] := Saturate_Int32_To_Int16 (a[223:192]) dst[191:176] := Saturate_Int32_To_Int16 (a[255:224]) dst[207:192] := Saturate_Int32_To_Int16 (b[159:128]) dst[223:208] := Saturate_Int32_To_Int16 (b[191:160]) dst[239:224] := Saturate_Int32_To_Int16 (b[223:192]) dst[255:240] := Saturate_Int32_To_Int16 (b[255:224]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpackssdw
__m512i _mm512_mask_packs_epi32 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_packs_epi32 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackssdw
CPUID Flags: AVX512BW

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96]) tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128]) tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160]) tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192]) tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224]) tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128]) tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160]) tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192]) tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224]) tmp_dst[271:256] := Saturate_Int32_To_Int16 (a[287:256]) tmp_dst[287:272] := Saturate_Int32_To_Int16 (a[319:288]) tmp_dst[303:288] := Saturate_Int32_To_Int16 (a[351:320]) tmp_dst[319:304] := Saturate_Int32_To_Int16 (a[383:352]) tmp_dst[335:320] := Saturate_Int32_To_Int16 (b[287:256]) tmp_dst[351:336] := Saturate_Int32_To_Int16 (b[319:288]) tmp_dst[367:352] := Saturate_Int32_To_Int16 (b[351:320]) tmp_dst[383:368] := Saturate_Int32_To_Int16 (b[383:352]) tmp_dst[399:384] := Saturate_Int32_To_Int16 (a[415:384]) tmp_dst[415:400] := Saturate_Int32_To_Int16 (a[447:416]) tmp_dst[431:416] := Saturate_Int32_To_Int16 (a[479:448]) tmp_dst[447:432] := Saturate_Int32_To_Int16 (a[511:480]) tmp_dst[463:448] := Saturate_Int32_To_Int16 (b[415:384]) tmp_dst[479:464] := Saturate_Int32_To_Int16 (b[447:416]) tmp_dst[495:480] := Saturate_Int32_To_Int16 (b[479:448]) tmp_dst[511:496] := Saturate_Int32_To_Int16 (b[511:480]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpackssdw
__m512i _mm512_maskz_packs_epi32 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_packs_epi32 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackssdw
CPUID Flags: AVX512BW

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96]) tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128]) tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160]) tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192]) tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224]) tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128]) tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160]) tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192]) tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224]) tmp_dst[271:256] := Saturate_Int32_To_Int16 (a[287:256]) tmp_dst[287:272] := Saturate_Int32_To_Int16 (a[319:288]) tmp_dst[303:288] := Saturate_Int32_To_Int16 (a[351:320]) tmp_dst[319:304] := Saturate_Int32_To_Int16 (a[383:352]) tmp_dst[335:320] := Saturate_Int32_To_Int16 (b[287:256]) tmp_dst[351:336] := Saturate_Int32_To_Int16 (b[319:288]) tmp_dst[367:352] := Saturate_Int32_To_Int16 (b[351:320]) tmp_dst[383:368] := Saturate_Int32_To_Int16 (b[383:352]) tmp_dst[399:384] := Saturate_Int32_To_Int16 (a[415:384]) tmp_dst[415:400] := Saturate_Int32_To_Int16 (a[447:416]) tmp_dst[431:416] := Saturate_Int32_To_Int16 (a[479:448]) tmp_dst[447:432] := Saturate_Int32_To_Int16 (a[511:480]) tmp_dst[463:448] := Saturate_Int32_To_Int16 (b[415:384]) tmp_dst[479:464] := Saturate_Int32_To_Int16 (b[447:416]) tmp_dst[495:480] := Saturate_Int32_To_Int16 (b[479:448]) tmp_dst[511:496] := Saturate_Int32_To_Int16 (b[511:480]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpackssdw
__m512i _mm512_packs_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_packs_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackssdw
CPUID Flags: AVX512BW

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst.

Operation

dst[15:0] := Saturate_Int32_To_Int16 (a[31:0]) dst[31:16] := Saturate_Int32_To_Int16 (a[63:32]) dst[47:32] := Saturate_Int32_To_Int16 (a[95:64]) dst[63:48] := Saturate_Int32_To_Int16 (a[127:96]) dst[79:64] := Saturate_Int32_To_Int16 (b[31:0]) dst[95:80] := Saturate_Int32_To_Int16 (b[63:32]) dst[111:96] := Saturate_Int32_To_Int16 (b[95:64]) dst[127:112] := Saturate_Int32_To_Int16 (b[127:96]) dst[143:128] := Saturate_Int32_To_Int16 (a[159:128]) dst[159:144] := Saturate_Int32_To_Int16 (a[191:160]) dst[175:160] := Saturate_Int32_To_Int16 (a[223:192]) dst[191:176] := Saturate_Int32_To_Int16 (a[255:224]) dst[207:192] := Saturate_Int32_To_Int16 (b[159:128]) dst[223:208] := Saturate_Int32_To_Int16 (b[191:160]) dst[239:224] := Saturate_Int32_To_Int16 (b[223:192]) dst[255:240] := Saturate_Int32_To_Int16 (b[255:224]) dst[271:256] := Saturate_Int32_To_Int16 (a[287:256]) dst[287:272] := Saturate_Int32_To_Int16 (a[319:288]) dst[303:288] := Saturate_Int32_To_Int16 (a[351:320]) dst[319:304] := Saturate_Int32_To_Int16 (a[383:352]) dst[335:320] := Saturate_Int32_To_Int16 (b[287:256]) dst[351:336] := Saturate_Int32_To_Int16 (b[319:288]) dst[367:352] := Saturate_Int32_To_Int16 (b[351:320]) dst[383:368] := Saturate_Int32_To_Int16 (b[383:352]) dst[399:384] := Saturate_Int32_To_Int16 (a[415:384]) dst[415:400] := Saturate_Int32_To_Int16 (a[447:416]) dst[431:416] := Saturate_Int32_To_Int16 (a[479:448]) dst[447:432] := Saturate_Int32_To_Int16 (a[511:480]) dst[463:448] := Saturate_Int32_To_Int16 (b[415:384]) dst[479:464] := Saturate_Int32_To_Int16 (b[447:416]) dst[495:480] := Saturate_Int32_To_Int16 (b[479:448]) dst[511:496] := Saturate_Int32_To_Int16 (b[511:480]) dst[MAX:512] := 0
vpackstorehd
void _mm512_mask_packstorehi_epi32 (void* mt, __mmask16 k, __m512i v1)

Synopsis

void _mm512_mask_packstorehi_epi32 (void* mt, __mmask16 k, __m512i v1)
#include "immintrin.h"
Instruction: vpackstorehd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed 32-bit integer elements of v1 into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

storeOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 15 IF k[j] IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*4) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*32 MEM[addr + storeOffset*4] := v1[i+31:i] FI storeOffset := storeOffset + 1 FI ENDFOR dst[MAX:512] := 0
vpackstorehd
void _mm512_packstorehi_epi32 (void* mt, __m512i v1)

Synopsis

void _mm512_packstorehi_epi32 (void* mt, __m512i v1)
#include "immintrin.h"
Instruction: vpackstorehd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed 32-bit integer elements of v1 into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).

Operation

storeOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 15 IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*4) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*32 MEM[addr + storeOffset*4] := v1[i+31:i] FI storeOffset := storeOffset + 1 ENDFOR dst[MAX:512] := 0
vpackstorehq
void _mm512_mask_packstorehi_epi64 (void* mt, __mmask8 k, __m512i v1)

Synopsis

void _mm512_mask_packstorehi_epi64 (void* mt, __mmask8 k, __m512i v1)
#include "immintrin.h"
Instruction: vpackstorehq m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed 64-bit integer elements of v1 into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

storeOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 7 IF k[j] IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*8) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*64 MEM[addr + storeOffset*8] := v1[i+63:i] FI storeOffset := storeOffset + 1 FI ENDFOR dst[MAX:512] := 0
vpackstorehq
void _mm512_packstorehi_epi64 (void* mt, __m512i v1)

Synopsis

void _mm512_packstorehi_epi64 (void* mt, __m512i v1)
#include "immintrin.h"
Instruction: vpackstorehq m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed 64-bit integer elements of v1 into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).

Operation

storeOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 7 IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*8) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*64 MEM[addr + storeOffset*8] := v1[i+63:i] FI storeOffset := storeOffset + 1 ENDFOR dst[MAX:512] := 0
vpackstorehpd
void _mm512_mask_packstorehi_pd (void* mt, __mmask8 k, __m512d v1)

Synopsis

void _mm512_mask_packstorehi_pd (void* mt, __mmask8 k, __m512d v1)
#include "immintrin.h"
Instruction: vpackstorehpd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

storeOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 7 IF k[j] IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*8) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*64 MEM[addr + storeOffset*8] := v1[i+63:i] FI storeOffset := storeOffset + 1 FI ENDFOR dst[MAX:512] := 0
vpackstorehpd
void _mm512_packstorehi_pd (void* mt, __m512d v1)

Synopsis

void _mm512_packstorehi_pd (void* mt, __m512d v1)
#include "immintrin.h"
Instruction: vpackstorehpd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).

Operation

storeOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 7 IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*8) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*64 MEM[addr + storeOffset*8] := v1[i+63:i] FI storeOffset := storeOffset + 1 ENDFOR dst[MAX:512] := 0
vpackstorehps
void _mm512_mask_packstorehi_ps (void* mt, __mmask16 k, __m512 v1)

Synopsis

void _mm512_mask_packstorehi_ps (void* mt, __mmask16 k, __m512 v1)
#include "immintrin.h"
Instruction: vpackstorehps m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed single-precision (32-bit) floating-point elements of v1 into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

storeOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 15 IF k[j] IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*4) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*32 MEM[addr + storeOffset*4] := v1[i+31:i] FI storeOffset := storeOffset + 1 FI ENDFOR dst[MAX:512] := 0
vpackstorehps
void _mm512_packstorehi_ps (void* mt, __m512 v1)

Synopsis

void _mm512_packstorehi_ps (void* mt, __m512 v1)
#include "immintrin.h"
Instruction: vpackstorehps m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed single-precision (32-bit) floating-point elements of v1 into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).

Operation

storeOffset := 0 foundNext64BytesBoundary := false addr = mt-64 FOR j := 0 to 15 IF foundNext64BytesBoundary == false IF ((addr + (storeOffset + 1)*4) % 64) == 0 foundNext64BytesBoundary = true FI ELSE i := j*32 MEM[addr + storeOffset*4] := v1[i+31:i] FI storeOffset := storeOffset + 1 ENDFOR dst[MAX:512] := 0
vpackstoreld
void _mm512_mask_packstorelo_epi32 (void* mt, __mmask16 k, __m512i v1)

Synopsis

void _mm512_mask_packstorelo_epi32 (void* mt, __mmask16 k, __m512i v1)
#include "immintrin.h"
Instruction: vpackstoreld m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed 32-bit integer elements of v1 into a doubleword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

storeOffset := 0 addr = mt FOR j := 0 to 15 IF k[j] i := j*32 MEM[addr + storeOffset*4] := v1[i+31:i] storeOffset := storeOffset + 1 IF ((addr + storeOffset*4) % 64) == 0 BREAK FI FI ENDFOR dst[MAX:512] := 0
vpackstoreld
void _mm512_packstorelo_epi32 (void* mt, __m512i v1)

Synopsis

void _mm512_packstorelo_epi32 (void* mt, __m512i v1)
#include "immintrin.h"
Instruction: vpackstoreld m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed 32-bit integer elements of v1 into a doubleword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt).

Operation

storeOffset := 0 addr = mt FOR j := 0 to 15 i := j*32 MEM[addr + storeOffset*4] := v1[i+31:i] storeOffset := storeOffset + 1 IF ((addr + storeOffset*4) % 64) == 0 BREAK FI ENDFOR dst[MAX:512] := 0
vpackstorelq
void _mm512_mask_packstorelo_epi64 (void* mt, __mmask8 k, __m512i v1)

Synopsis

void _mm512_mask_packstorelo_epi64 (void* mt, __mmask8 k, __m512i v1)
#include "immintrin.h"
Instruction: vpackstorelq m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed 64-bit integer elements of v1 into a quadword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

storeOffset := 0 addr = mt FOR j := 0 to 7 IF k[j] i := j*64 MEM[addr + storeOffset*8] := v1[i+63:i] storeOffset := storeOffset + 1 IF ((addr + storeOffset*8) % 64) == 0 BREAK FI FI ENDFOR dst[MAX:512] := 0
vpackstorelq
void _mm512_packstorelo_epi64 (void* mt, __m512i v1)

Synopsis

void _mm512_packstorelo_epi64 (void* mt, __m512i v1)
#include "immintrin.h"
Instruction: vpackstorelq m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed 64-bit integer elements of v1 into a quadword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt).

Operation

storeOffset := 0 addr = mt FOR j := 0 to 7 i := j*64 MEM[addr + storeOffset*8] := v1[i+63:i] storeOffset := storeOffset + 1 IF ((addr + storeOffset*8) % 64) == 0 BREAK FI ENDFOR dst[MAX:512] := 0
vpackstorelpd
void _mm512_mask_packstorelo_pd (void* mt, __mmask8 k, __m512d v1)

Synopsis

void _mm512_mask_packstorelo_pd (void* mt, __mmask8 k, __m512d v1)
#include "immintrin.h"
Instruction: vpackstorelpd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

storeOffset := 0 addr = mt FOR j := 0 to 7 IF k[j] i := j*64 MEM[addr + storeOffset*8] := v1[i+63:i] storeOffset := storeOffset + 1 IF ((addr + storeOffset*8) % 64) == 0 BREAK FI FI ENDFOR dst[MAX:512] := 0
vpackstorelpd
void _mm512_packstorelo_pd (void* mt, __m512d v1)

Synopsis

void _mm512_packstorelo_pd (void* mt, __m512d v1)
#include "immintrin.h"
Instruction: vpackstorelpd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt).

Operation

storeOffset := 0 addr = mt FOR j := 0 to 7 i := j*64 MEM[addr + storeOffset*8] := v1[i+63:i] storeOffset := storeOffset + 1 IF ((addr + storeOffset*8) % 64) == 0 BREAK FI ENDFOR dst[MAX:512] := 0
vpackstorelps
void _mm512_mask_packstorelo_ps (void* mt, __mmask16 k, __m512 v1)

Synopsis

void _mm512_mask_packstorelo_ps (void* mt, __mmask16 k, __m512 v1)
#include "immintrin.h"
Instruction: vpackstorelps m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed single-precision (32-bit) floating-point elements of v1 into a doubleword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).

Operation

storeOffset := 0 addr = mt FOR j := 0 to 15 IF k[j] i := j*32 MEM[addr + storeOffset*4] := v1[i+31:i] storeOffset := storeOffset + 1 IF ((addr + storeOffset*4) % 64) == 0 BREAK FI FI ENDFOR dst[MAX:512] := 0
vpackstorelps
void _mm512_packstorelo_ps (void* mt, __m512 v1)

Synopsis

void _mm512_packstorelo_ps (void* mt, __m512 v1)
#include "immintrin.h"
Instruction: vpackstorelps m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed single-precision (32-bit) floating-point elements of v1 into a doubleword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt).

Operation

storeOffset := 0 addr = mt FOR j := 0 to 15 i := j*32 MEM[addr + storeOffset*4] := v1[i+31:i] storeOffset := storeOffset + 1 IF ((addr + storeOffset*4) % 64) == 0 BREAK FI ENDFOR dst[MAX:512] := 0
vpackuswb
__m128i _mm_mask_packus_epi16 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_packus_epi16 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpackuswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112]) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpackuswb
__m128i _mm_maskz_packus_epi16 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_packus_epi16 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpackuswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112]) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
packuswb
__m128i _mm_packus_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_packus_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: packuswb xmm, xmm
CPUID Flags: SSE2

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst.

Operation

dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0]) dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16]) dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32]) dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48]) dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64]) dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80]) dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96]) dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112]) dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0]) dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16]) dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32]) dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48]) dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64]) dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80]) dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96]) dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpackuswb
__m256i _mm256_mask_packus_epi16 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_packus_epi16 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackuswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112]) tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128]) tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144]) tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160]) tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176]) tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192]) tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208]) tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224]) tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240]) tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128]) tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144]) tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160]) tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176]) tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192]) tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208]) tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224]) tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240]) FOR j 
:= 0 to 31 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpackuswb
__m256i _mm256_maskz_packus_epi16 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_packus_epi16 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackuswb
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112]) tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128]) tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144]) tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160]) tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176]) tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192]) tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208]) tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224]) tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240]) tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128]) tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144]) tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160]) tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176]) tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192]) tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208]) tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224]) tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240]) FOR j 
:= 0 to 31 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpackuswb
__m256i _mm256_packus_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_packus_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackuswb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst.

Operation

dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0]) dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16]) dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32]) dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48]) dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64]) dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80]) dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96]) dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112]) dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0]) dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16]) dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32]) dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48]) dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64]) dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80]) dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96]) dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112]) dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128]) dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144]) dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160]) dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176]) dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192]) dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208]) dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224]) dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240]) dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128]) dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144]) dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160]) dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176]) dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192]) dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208]) dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224]) dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpackuswb
__m512i _mm512_mask_packus_epi16 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_packus_epi16 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackuswb
CPUID Flags: AVX512BW

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112]) tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128]) tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144]) tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160]) tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176]) tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192]) tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208]) tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224]) tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240]) tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128]) tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144]) tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160]) tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176]) tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192]) tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208]) tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224]) tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240]) 
tmp_dst[263:256] := Saturate_Int16_To_UnsignedInt8 (a[271:256]) tmp_dst[271:264] := Saturate_Int16_To_UnsignedInt8 (a[287:272]) tmp_dst[279:272] := Saturate_Int16_To_UnsignedInt8 (a[303:288]) tmp_dst[287:280] := Saturate_Int16_To_UnsignedInt8 (a[319:304]) tmp_dst[295:288] := Saturate_Int16_To_UnsignedInt8 (a[335:320]) tmp_dst[303:296] := Saturate_Int16_To_UnsignedInt8 (a[351:336]) tmp_dst[311:304] := Saturate_Int16_To_UnsignedInt8 (a[367:352]) tmp_dst[319:312] := Saturate_Int16_To_UnsignedInt8 (a[383:368]) tmp_dst[327:320] := Saturate_Int16_To_UnsignedInt8 (b[271:256]) tmp_dst[335:328] := Saturate_Int16_To_UnsignedInt8 (b[287:272]) tmp_dst[343:336] := Saturate_Int16_To_UnsignedInt8 (b[303:288]) tmp_dst[351:344] := Saturate_Int16_To_UnsignedInt8 (b[319:304]) tmp_dst[359:352] := Saturate_Int16_To_UnsignedInt8 (b[335:320]) tmp_dst[367:360] := Saturate_Int16_To_UnsignedInt8 (b[351:336]) tmp_dst[375:368] := Saturate_Int16_To_UnsignedInt8 (b[367:352]) tmp_dst[383:376] := Saturate_Int16_To_UnsignedInt8 (b[383:368]) tmp_dst[391:384] := Saturate_Int16_To_UnsignedInt8 (a[399:384]) tmp_dst[399:392] := Saturate_Int16_To_UnsignedInt8 (a[415:400]) tmp_dst[407:400] := Saturate_Int16_To_UnsignedInt8 (a[431:416]) tmp_dst[415:408] := Saturate_Int16_To_UnsignedInt8 (a[447:432]) tmp_dst[423:416] := Saturate_Int16_To_UnsignedInt8 (a[463:448]) tmp_dst[431:424] := Saturate_Int16_To_UnsignedInt8 (a[479:464]) tmp_dst[439:432] := Saturate_Int16_To_UnsignedInt8 (a[495:480]) tmp_dst[447:440] := Saturate_Int16_To_UnsignedInt8 (a[511:496]) tmp_dst[455:448] := Saturate_Int16_To_UnsignedInt8 (b[399:384]) tmp_dst[463:456] := Saturate_Int16_To_UnsignedInt8 (b[415:400]) tmp_dst[471:464] := Saturate_Int16_To_UnsignedInt8 (b[431:416]) tmp_dst[479:472] := Saturate_Int16_To_UnsignedInt8 (b[447:432]) tmp_dst[487:480] := Saturate_Int16_To_UnsignedInt8 (b[463:448]) tmp_dst[495:488] := Saturate_Int16_To_UnsignedInt8 (b[479:464]) tmp_dst[503:496] := Saturate_Int16_To_UnsignedInt8 (b[495:480]) 
tmp_dst[511:504] := Saturate_Int16_To_UnsignedInt8 (b[511:496]) FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpackuswb
__m512i _mm512_maskz_packus_epi16 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_packus_epi16 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackuswb
CPUID Flags: AVX512BW

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112]) tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128]) tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144]) tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160]) tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176]) tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192]) tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208]) tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224]) tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240]) tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128]) tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144]) tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160]) tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176]) tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192]) tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208]) tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224]) tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240]) 
tmp_dst[263:256] := Saturate_Int16_To_UnsignedInt8 (a[271:256]) tmp_dst[271:264] := Saturate_Int16_To_UnsignedInt8 (a[287:272]) tmp_dst[279:272] := Saturate_Int16_To_UnsignedInt8 (a[303:288]) tmp_dst[287:280] := Saturate_Int16_To_UnsignedInt8 (a[319:304]) tmp_dst[295:288] := Saturate_Int16_To_UnsignedInt8 (a[335:320]) tmp_dst[303:296] := Saturate_Int16_To_UnsignedInt8 (a[351:336]) tmp_dst[311:304] := Saturate_Int16_To_UnsignedInt8 (a[367:352]) tmp_dst[319:312] := Saturate_Int16_To_UnsignedInt8 (a[383:368]) tmp_dst[327:320] := Saturate_Int16_To_UnsignedInt8 (b[271:256]) tmp_dst[335:328] := Saturate_Int16_To_UnsignedInt8 (b[287:272]) tmp_dst[343:336] := Saturate_Int16_To_UnsignedInt8 (b[303:288]) tmp_dst[351:344] := Saturate_Int16_To_UnsignedInt8 (b[319:304]) tmp_dst[359:352] := Saturate_Int16_To_UnsignedInt8 (b[335:320]) tmp_dst[367:360] := Saturate_Int16_To_UnsignedInt8 (b[351:336]) tmp_dst[375:368] := Saturate_Int16_To_UnsignedInt8 (b[367:352]) tmp_dst[383:376] := Saturate_Int16_To_UnsignedInt8 (b[383:368]) tmp_dst[391:384] := Saturate_Int16_To_UnsignedInt8 (a[399:384]) tmp_dst[399:392] := Saturate_Int16_To_UnsignedInt8 (a[415:400]) tmp_dst[407:400] := Saturate_Int16_To_UnsignedInt8 (a[431:416]) tmp_dst[415:408] := Saturate_Int16_To_UnsignedInt8 (a[447:432]) tmp_dst[423:416] := Saturate_Int16_To_UnsignedInt8 (a[463:448]) tmp_dst[431:424] := Saturate_Int16_To_UnsignedInt8 (a[479:464]) tmp_dst[439:432] := Saturate_Int16_To_UnsignedInt8 (a[495:480]) tmp_dst[447:440] := Saturate_Int16_To_UnsignedInt8 (a[511:496]) tmp_dst[455:448] := Saturate_Int16_To_UnsignedInt8 (b[399:384]) tmp_dst[463:456] := Saturate_Int16_To_UnsignedInt8 (b[415:400]) tmp_dst[471:464] := Saturate_Int16_To_UnsignedInt8 (b[431:416]) tmp_dst[479:472] := Saturate_Int16_To_UnsignedInt8 (b[447:432]) tmp_dst[487:480] := Saturate_Int16_To_UnsignedInt8 (b[463:448]) tmp_dst[495:488] := Saturate_Int16_To_UnsignedInt8 (b[479:464]) tmp_dst[503:496] := Saturate_Int16_To_UnsignedInt8 (b[495:480]) 
tmp_dst[511:504] := Saturate_Int16_To_UnsignedInt8 (b[511:496]) FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpackuswb
__m512i _mm512_packus_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_packus_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackuswb
CPUID Flags: AVX512BW

Description

Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst.

Operation

dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0]) dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16]) dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32]) dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48]) dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64]) dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80]) dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96]) dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112]) dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0]) dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16]) dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32]) dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48]) dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64]) dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80]) dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96]) dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112]) dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128]) dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144]) dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160]) dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176]) dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192]) dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208]) dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224]) dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240]) dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128]) dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144]) dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160]) dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176]) dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192]) dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208]) dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224]) dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240]) dst[263:256] := Saturate_Int16_To_UnsignedInt8 (a[271:256]) dst[271:264] := Saturate_Int16_To_UnsignedInt8 (a[287:272]) dst[279:272] := 
Saturate_Int16_To_UnsignedInt8 (a[303:288]) dst[287:280] := Saturate_Int16_To_UnsignedInt8 (a[319:304]) dst[295:288] := Saturate_Int16_To_UnsignedInt8 (a[335:320]) dst[303:296] := Saturate_Int16_To_UnsignedInt8 (a[351:336]) dst[311:304] := Saturate_Int16_To_UnsignedInt8 (a[367:352]) dst[319:312] := Saturate_Int16_To_UnsignedInt8 (a[383:368]) dst[327:320] := Saturate_Int16_To_UnsignedInt8 (b[271:256]) dst[335:328] := Saturate_Int16_To_UnsignedInt8 (b[287:272]) dst[343:336] := Saturate_Int16_To_UnsignedInt8 (b[303:288]) dst[351:344] := Saturate_Int16_To_UnsignedInt8 (b[319:304]) dst[359:352] := Saturate_Int16_To_UnsignedInt8 (b[335:320]) dst[367:360] := Saturate_Int16_To_UnsignedInt8 (b[351:336]) dst[375:368] := Saturate_Int16_To_UnsignedInt8 (b[367:352]) dst[383:376] := Saturate_Int16_To_UnsignedInt8 (b[383:368]) dst[391:384] := Saturate_Int16_To_UnsignedInt8 (a[399:384]) dst[399:392] := Saturate_Int16_To_UnsignedInt8 (a[415:400]) dst[407:400] := Saturate_Int16_To_UnsignedInt8 (a[431:416]) dst[415:408] := Saturate_Int16_To_UnsignedInt8 (a[447:432]) dst[423:416] := Saturate_Int16_To_UnsignedInt8 (a[463:448]) dst[431:424] := Saturate_Int16_To_UnsignedInt8 (a[479:464]) dst[439:432] := Saturate_Int16_To_UnsignedInt8 (a[495:480]) dst[447:440] := Saturate_Int16_To_UnsignedInt8 (a[511:496]) dst[455:448] := Saturate_Int16_To_UnsignedInt8 (b[399:384]) dst[463:456] := Saturate_Int16_To_UnsignedInt8 (b[415:400]) dst[471:464] := Saturate_Int16_To_UnsignedInt8 (b[431:416]) dst[479:472] := Saturate_Int16_To_UnsignedInt8 (b[447:432]) dst[487:480] := Saturate_Int16_To_UnsignedInt8 (b[463:448]) dst[495:488] := Saturate_Int16_To_UnsignedInt8 (b[479:464]) dst[503:496] := Saturate_Int16_To_UnsignedInt8 (b[495:480]) dst[511:504] := Saturate_Int16_To_UnsignedInt8 (b[511:496]) dst[MAX:512] := 0
vpackusdw
__m128i _mm_mask_packus_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_packus_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpackusdw
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96]) FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpackusdw
__m128i _mm_maskz_packus_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_packus_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpackusdw
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96]) FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
packusdw
__m128i _mm_packus_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_packus_epi32 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: packusdw xmm, xmm
CPUID Flags: SSE4.1

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst.

Operation

dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0]) dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32]) dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64]) dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96]) dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0]) dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32]) dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64]) dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])

Performance

Architecture | Latency | Throughput
Haswell | 1 | 1
Ivy Bridge | 1 | 0.5
Sandy Bridge | 1 | 0.5
Westmere | 1 | 0.5
Nehalem | 1 | 0.5
vpackusdw
__m256i _mm256_mask_packus_epi32 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_packus_epi32 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackusdw
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96]) tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128]) tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160]) tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192]) tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224]) tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128]) tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160]) tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192]) tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224]) FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpackusdw
__m256i _mm256_maskz_packus_epi32 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_packus_epi32 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackusdw
CPUID Flags: AVX512VL + AVX512BW

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96]) tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128]) tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160]) tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192]) tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224]) tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128]) tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160]) tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192]) tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224]) FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpackusdw
__m256i _mm256_packus_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_packus_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackusdw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst.

Operation

dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0]) dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32]) dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64]) dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96]) dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0]) dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32]) dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64]) dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96]) dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128]) dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160]) dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192]) dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224]) dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128]) dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160]) dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192]) dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224]) dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vpackusdw
__m512i _mm512_mask_packus_epi32 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_packus_epi32 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackusdw
CPUID Flags: AVX512BW

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96]) tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128]) tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160]) tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192]) tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224]) tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128]) tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160]) tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192]) tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224]) tmp_dst[271:256] := Saturate_Int32_To_UnsignedInt16 (a[287:256]) tmp_dst[287:272] := Saturate_Int32_To_UnsignedInt16 (a[319:288]) tmp_dst[303:288] := Saturate_Int32_To_UnsignedInt16 (a[351:320]) tmp_dst[319:304] := Saturate_Int32_To_UnsignedInt16 (a[383:352]) tmp_dst[335:320] := Saturate_Int32_To_UnsignedInt16 (b[287:256]) tmp_dst[351:336] := Saturate_Int32_To_UnsignedInt16 (b[319:288]) tmp_dst[367:352] := Saturate_Int32_To_UnsignedInt16 (b[351:320]) tmp_dst[383:368] := Saturate_Int32_To_UnsignedInt16 (b[383:352]) tmp_dst[399:384] := Saturate_Int32_To_UnsignedInt16 (a[415:384]) tmp_dst[415:400] := Saturate_Int32_To_UnsignedInt16 (a[447:416]) tmp_dst[431:416] := Saturate_Int32_To_UnsignedInt16 (a[479:448]) tmp_dst[447:432] := Saturate_Int32_To_UnsignedInt16 (a[511:480]) tmp_dst[463:448] := Saturate_Int32_To_UnsignedInt16 (b[415:384]) tmp_dst[479:464] := Saturate_Int32_To_UnsignedInt16 (b[447:416]) tmp_dst[495:480] := Saturate_Int32_To_UnsignedInt16 (b[479:448]) 
tmp_dst[511:496] := Saturate_Int32_To_UnsignedInt16 (b[511:480]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpackusdw
__m512i _mm512_maskz_packus_epi32 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_packus_epi32 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackusdw
CPUID Flags: AVX512BW

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96]) tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128]) tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160]) tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192]) tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224]) tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128]) tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160]) tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192]) tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224]) tmp_dst[271:256] := Saturate_Int32_To_UnsignedInt16 (a[287:256]) tmp_dst[287:272] := Saturate_Int32_To_UnsignedInt16 (a[319:288]) tmp_dst[303:288] := Saturate_Int32_To_UnsignedInt16 (a[351:320]) tmp_dst[319:304] := Saturate_Int32_To_UnsignedInt16 (a[383:352]) tmp_dst[335:320] := Saturate_Int32_To_UnsignedInt16 (b[287:256]) tmp_dst[351:336] := Saturate_Int32_To_UnsignedInt16 (b[319:288]) tmp_dst[367:352] := Saturate_Int32_To_UnsignedInt16 (b[351:320]) tmp_dst[383:368] := Saturate_Int32_To_UnsignedInt16 (b[383:352]) tmp_dst[399:384] := Saturate_Int32_To_UnsignedInt16 (a[415:384]) tmp_dst[415:400] := Saturate_Int32_To_UnsignedInt16 (a[447:416]) tmp_dst[431:416] := Saturate_Int32_To_UnsignedInt16 (a[479:448]) tmp_dst[447:432] := Saturate_Int32_To_UnsignedInt16 (a[511:480]) tmp_dst[463:448] := Saturate_Int32_To_UnsignedInt16 (b[415:384]) tmp_dst[479:464] := Saturate_Int32_To_UnsignedInt16 (b[447:416]) tmp_dst[495:480] := Saturate_Int32_To_UnsignedInt16 (b[479:448]) 
tmp_dst[511:496] := Saturate_Int32_To_UnsignedInt16 (b[511:480]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpackusdw
__m512i _mm512_packus_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_packus_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackusdw
CPUID Flags: AVX512BW

Description

Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst.

Operation

dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0]) dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32]) dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64]) dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96]) dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0]) dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32]) dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64]) dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96]) dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128]) dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160]) dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192]) dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224]) dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128]) dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160]) dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192]) dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224]) dst[271:256] := Saturate_Int32_To_UnsignedInt16 (a[287:256]) dst[287:272] := Saturate_Int32_To_UnsignedInt16 (a[319:288]) dst[303:288] := Saturate_Int32_To_UnsignedInt16 (a[351:320]) dst[319:304] := Saturate_Int32_To_UnsignedInt16 (a[383:352]) dst[335:320] := Saturate_Int32_To_UnsignedInt16 (b[287:256]) dst[351:336] := Saturate_Int32_To_UnsignedInt16 (b[319:288]) dst[367:352] := Saturate_Int32_To_UnsignedInt16 (b[351:320]) dst[383:368] := Saturate_Int32_To_UnsignedInt16 (b[383:352]) dst[399:384] := Saturate_Int32_To_UnsignedInt16 (a[415:384]) dst[415:400] := Saturate_Int32_To_UnsignedInt16 (a[447:416]) dst[431:416] := Saturate_Int32_To_UnsignedInt16 (a[479:448]) dst[447:432] := Saturate_Int32_To_UnsignedInt16 (a[511:480]) dst[463:448] := Saturate_Int32_To_UnsignedInt16 (b[415:384]) dst[479:464] := Saturate_Int32_To_UnsignedInt16 (b[447:416]) dst[495:480] := Saturate_Int32_To_UnsignedInt16 (b[479:448]) dst[511:496] := Saturate_Int32_To_UnsignedInt16 (b[511:480]) dst[MAX:512] := 0
pause
void _mm_pause (void)

Synopsis

void _mm_pause (void)
#include "emmintrin.h"
Instruction: pause
CPUID Flags: SSE2

Description

Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance and power consumption of spin-wait loops.

Performance

Architecture | Latency | Throughput
Haswell | 5 | -
Ivy Bridge | 4 | -
Sandy Bridge | 4 | -
Westmere | 5 | -
Nehalem | 5 | -
pavgb
__m64 _m_pavgb (__m64 a, __m64 b)

Synopsis

__m64 _m_pavgb (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pavgb mm, mm
CPUID Flags: SSE

Description

Average packed unsigned 8-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*8 dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ENDFOR
pavgw
__m64 _m_pavgw (__m64 a, __m64 b)

Synopsis

__m64 _m_pavgw (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pavgw mm, mm
CPUID Flags: SSE

Description

Average packed unsigned 16-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*16 dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ENDFOR
pdep
unsigned int _pdep_u32 (unsigned int a, unsigned int mask)

Synopsis

unsigned int _pdep_u32 (unsigned int a, unsigned int mask)
#include "immintrin.h"
Instruction: pdep r32, r32, r32
CPUID Flags: BMI2

Description

Deposit contiguous low bits from unsigned 32-bit integer a to dst at the corresponding bit locations specified by mask; all other bits in dst are set to zero.

Operation

tmp := a dst := 0 m := 0 k := 0 DO WHILE m < 32 IF mask[m] = 1 dst[m] := tmp[k] k := k + 1 FI m := m + 1 OD

Performance

Architecture | Latency | Throughput
Haswell | 3 | -
pdep
unsigned __int64 _pdep_u64 (unsigned __int64 a, unsigned __int64 mask)

Synopsis

unsigned __int64 _pdep_u64 (unsigned __int64 a, unsigned __int64 mask)
#include "immintrin.h"
Instruction: pdep r64, r64, r64
CPUID Flags: BMI2

Description

Deposit contiguous low bits from unsigned 64-bit integer a to dst at the corresponding bit locations specified by mask; all other bits in dst are set to zero.

Operation

tmp := a dst := 0 m := 0 k := 0 DO WHILE m < 64 IF mask[m] = 1 dst[m] := tmp[k] k := k + 1 FI m := m + 1 OD

Performance

Architecture | Latency | Throughput
Haswell | 3 | -
vpermilpd
__m128d _mm_mask_permute_pd (__m128d src, __mmask8 k, __m128d a, const int imm8)

Synopsis

__m128d _mm_mask_permute_pd (__m128d src, __mmask8 k, __m128d a, const int imm8)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0] IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64] IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0] IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64] FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpermilpd
__m128d _mm_maskz_permute_pd (__mmask8 k, __m128d a, const int imm8)

Synopsis

__m128d _mm_maskz_permute_pd (__mmask8 k, __m128d a, const int imm8)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0] IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64] IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0] IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64] FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpermilpd
__m128d _mm_permute_pd (__m128d a, int imm8)

Synopsis

__m128d _mm_permute_pd (__m128d a, int imm8)
#include "immintrin.h"
Instruction: vpermilpd xmm, xmm, imm
CPUID Flags: AVX

Description

Shuffle double-precision (64-bit) floating-point elements in a using the control in imm8, and store the results in dst.

Operation

IF (imm8[0] == 0) dst[63:0] := a[63:0] IF (imm8[0] == 1) dst[63:0] := a[127:64] IF (imm8[1] == 0) dst[127:64] := a[63:0] IF (imm8[1] == 1) dst[127:64] := a[127:64] dst[MAX:128] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
Ivy Bridge | 1 | -
Sandy Bridge | 1 | -
vpermilpd
__m256d _mm256_mask_permute_pd (__m256d src, __mmask8 k, __m256d a, const int imm8)

Synopsis

__m256d _mm256_mask_permute_pd (__m256d src, __mmask8 k, __m256d a, const int imm8)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0] IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64] IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0] IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64] IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128] IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192] IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128] IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192] FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpermilpd
__m256d _mm256_maskz_permute_pd (__mmask8 k, __m256d a, const int imm8)

Synopsis

__m256d _mm256_maskz_permute_pd (__mmask8 k, __m256d a, const int imm8)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0] IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64] IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0] IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64] IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128] IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192] IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128] IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192] FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermilpd
__m256d _mm256_permute_pd (__m256d a, int imm8)

Synopsis

__m256d _mm256_permute_pd (__m256d a, int imm8)
#include "immintrin.h"
Instruction: vpermilpd ymm, ymm, imm
CPUID Flags: AVX

Description

Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.

Operation

IF (imm8[0] == 0) dst[63:0] := a[63:0] IF (imm8[0] == 1) dst[63:0] := a[127:64] IF (imm8[1] == 0) dst[127:64] := a[63:0] IF (imm8[1] == 1) dst[127:64] := a[127:64] IF (imm8[2] == 0) dst[191:128] := a[191:128] IF (imm8[2] == 1) dst[191:128] := a[255:192] IF (imm8[3] == 0) dst[255:192] := a[191:128] IF (imm8[3] == 1) dst[255:192] := a[255:192] dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
Ivy Bridge | 1 | -
Sandy Bridge | 1 | -
vpermilpd
__m512d _mm512_mask_permute_pd (__m512d src, __mmask8 k, __m512d a, const int imm8)

Synopsis

__m512d _mm512_mask_permute_pd (__m512d src, __mmask8 k, __m512d a, const int imm8)
#include "immintrin.h"
Instruction: vpermilpd zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0] IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64] IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0] IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64] IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128] IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192] IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128] IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192] IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256] IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320] IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256] IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320] IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384] IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448] IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384] IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448] FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpermilpd
__m512d _mm512_maskz_permute_pd (__mmask8 k, __m512d a, const int imm8)

Synopsis

__m512d _mm512_maskz_permute_pd (__mmask8 k, __m512d a, const int imm8)
#include "immintrin.h"
Instruction: vpermilpd zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0] IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64] IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0] IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64] IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128] IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192] IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128] IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192] IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256] IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320] IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256] IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320] IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384] IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448] IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384] IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448] FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermilpd
__m512d _mm512_permute_pd (__m512d a, const int imm8)

Synopsis

__m512d _mm512_permute_pd (__m512d a, const int imm8)
#include "immintrin.h"
Instruction: vpermilpd zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.

Operation

IF (imm8[0] == 0) dst[63:0] := a[63:0] IF (imm8[0] == 1) dst[63:0] := a[127:64] IF (imm8[1] == 0) dst[127:64] := a[63:0] IF (imm8[1] == 1) dst[127:64] := a[127:64] IF (imm8[2] == 0) dst[191:128] := a[191:128] IF (imm8[2] == 1) dst[191:128] := a[255:192] IF (imm8[3] == 0) dst[255:192] := a[191:128] IF (imm8[3] == 1) dst[255:192] := a[255:192] IF (imm8[4] == 0) dst[319:256] := a[319:256] IF (imm8[4] == 1) dst[319:256] := a[383:320] IF (imm8[5] == 0) dst[383:320] := a[319:256] IF (imm8[5] == 1) dst[383:320] := a[383:320] IF (imm8[6] == 0) dst[447:384] := a[447:384] IF (imm8[6] == 1) dst[447:384] := a[511:448] IF (imm8[7] == 0) dst[511:448] := a[447:384] IF (imm8[7] == 1) dst[511:448] := a[511:448] dst[MAX:512] := 0
vpermilps
__m128 _mm_mask_permute_ps (__m128 src, __mmask8 k, __m128 a, const int imm8)

Synopsis

__m128 _mm_mask_permute_ps (__m128 src, __mmask8 k, __m128 a, const int imm8)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpermilps
__m128 _mm_maskz_permute_ps (__mmask8 k, __m128 a, const int imm8)

Synopsis

__m128 _mm_maskz_permute_ps (__mmask8 k, __m128 a, const int imm8)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpermilps
__m128 _mm_permute_ps (__m128 a, int imm8)

Synopsis

__m128 _mm_permute_ps (__m128 a, int imm8)
#include "immintrin.h"
Instruction: vpermilps xmm, xmm, imm
CPUID Flags: AVX

Description

Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } dst[31:0] := SELECT4(a[127:0], imm8[1:0]) dst[63:32] := SELECT4(a[127:0], imm8[3:2]) dst[95:64] := SELECT4(a[127:0], imm8[5:4]) dst[127:96] := SELECT4(a[127:0], imm8[7:6]) dst[MAX:128] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
Ivy Bridge | 1 | -
Sandy Bridge | 1 | -
vpermilps
__m256 _mm256_mask_permute_ps (__m256 src, __mmask8 k, __m256 a, const int imm8)

Synopsis

__m256 _mm256_mask_permute_ps (__m256 src, __mmask8 k, __m256 a, const int imm8)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpermilps
__m256 _mm256_maskz_permute_ps (__mmask8 k, __m256 a, const int imm8)

Synopsis

__m256 _mm256_maskz_permute_ps (__mmask8 k, __m256 a, const int imm8)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermilps
__m256 _mm256_permute_ps (__m256 a, int imm8)

Synopsis

__m256 _mm256_permute_ps (__m256 a, int imm8)
#include "immintrin.h"
Instruction: vpermilps ymm, ymm, imm
CPUID Flags: AVX

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } dst[31:0] := SELECT4(a[127:0], imm8[1:0]) dst[63:32] := SELECT4(a[127:0], imm8[3:2]) dst[95:64] := SELECT4(a[127:0], imm8[5:4]) dst[127:96] := SELECT4(a[127:0], imm8[7:6]) dst[159:128] := SELECT4(a[255:128], imm8[1:0]) dst[191:160] := SELECT4(a[255:128], imm8[3:2]) dst[223:192] := SELECT4(a[255:128], imm8[5:4]) dst[255:224] := SELECT4(a[255:128], imm8[7:6]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge1-
vpermilps
__m512 _mm512_mask_permute_ps (__m512 src, __mmask16 k, __m512 a, const int imm8)

Synopsis

__m512 _mm512_mask_permute_ps (__m512 src, __mmask16 k, __m512 a, const int imm8)
#include "immintrin.h"
Instruction: vpermilps zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpermilps
__m512 _mm512_maskz_permute_ps (__mmask16 k, __m512 a, const int imm8)

Synopsis

__m512 _mm512_maskz_permute_ps (__mmask16 k, __m512 a, const int imm8)
#include "immintrin.h"
Instruction: vpermilps zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermilps
__m512 _mm512_permute_ps (__m512 a, const int imm8)

Synopsis

__m512 _mm512_permute_ps (__m512 a, const int imm8)
#include "immintrin.h"
Instruction: vpermilps zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } dst[31:0] := SELECT4(a[127:0], imm8[1:0]) dst[63:32] := SELECT4(a[127:0], imm8[3:2]) dst[95:64] := SELECT4(a[127:0], imm8[5:4]) dst[127:96] := SELECT4(a[127:0], imm8[7:6]) dst[159:128] := SELECT4(a[255:128], imm8[1:0]) dst[191:160] := SELECT4(a[255:128], imm8[3:2]) dst[223:192] := SELECT4(a[255:128], imm8[5:4]) dst[255:224] := SELECT4(a[255:128], imm8[7:6]) dst[287:256] := SELECT4(a[383:256], imm8[1:0]) dst[319:288] := SELECT4(a[383:256], imm8[3:2]) dst[351:320] := SELECT4(a[383:256], imm8[5:4]) dst[383:352] := SELECT4(a[383:256], imm8[7:6]) dst[415:384] := SELECT4(a[511:384], imm8[1:0]) dst[447:416] := SELECT4(a[511:384], imm8[3:2]) dst[479:448] := SELECT4(a[511:384], imm8[5:4]) dst[511:480] := SELECT4(a[511:384], imm8[7:6]) dst[MAX:512] := 0
vperm2f128
__m256d _mm256_permute2f128_pd (__m256d a, __m256d b, int imm8)

Synopsis

__m256d _mm256_permute2f128_pd (__m256d a, __m256d b, int imm8)
#include "immintrin.h"
Instruction: vperm2f128 ymm, ymm, ymm, imm
CPUID Flags: AVX

Description

Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.

Operation

SELECT4(src1, src2, control){ CASE(control[1:0]) 0: tmp[127:0] := src1[127:0] 1: tmp[127:0] := src1[255:128] 2: tmp[127:0] := src2[127:0] 3: tmp[127:0] := src2[255:128] ESAC IF control[3] tmp[127:0] := 0 FI RETURN tmp[127:0] } dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge11
Sandy Bridge11
vperm2f128
__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)

Synopsis

__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)
#include "immintrin.h"
Instruction: vperm2f128 ymm, ymm, ymm, imm
CPUID Flags: AVX

Description

Shuffle 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.

Operation

SELECT4(src1, src2, control){ CASE(control[1:0]) 0: tmp[127:0] := src1[127:0] 1: tmp[127:0] := src1[255:128] 2: tmp[127:0] := src2[127:0] 3: tmp[127:0] := src2[255:128] ESAC IF control[3] tmp[127:0] := 0 FI RETURN tmp[127:0] } dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge11
Sandy Bridge11
vperm2f128
__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8)

Synopsis

__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vperm2f128 ymm, ymm, ymm, imm
CPUID Flags: AVX

Description

Shuffle 128-bits (composed of integer data) selected by imm8 from a and b, and store the results in dst.

Operation

SELECT4(src1, src2, control){ CASE(control[1:0]) 0: tmp[127:0] := src1[127:0] 1: tmp[127:0] := src1[255:128] 2: tmp[127:0] := src2[127:0] 3: tmp[127:0] := src2[255:128] ESAC IF control[3] tmp[127:0] := 0 FI RETURN tmp[127:0] } dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge11
Sandy Bridge11
vperm2i128
__m256i _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8)

Synopsis

__m256i _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vperm2i128 ymm, ymm, ymm, imm
CPUID Flags: AVX2

Description

Shuffle 128-bits (composed of integer data) selected by imm8 from a and b, and store the results in dst.

Operation

SELECT4(src1, src2, control){ CASE(control[1:0]) 0: tmp[127:0] := src1[127:0] 1: tmp[127:0] := src1[255:128] 2: tmp[127:0] := src2[127:0] 3: tmp[127:0] := src2[255:128] ESAC IF control[3] tmp[127:0] := 0 FI RETURN tmp[127:0] } dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpermf32x4
__m512i _mm512_mask_permute4f128_epi32 (__m512i src, __mmask16 k, __m512i a, _MM_PERM_ENUM imm8)

Synopsis

__m512i _mm512_mask_permute4f128_epi32 (__m512i src, __mmask16 k, __m512i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpermf32x4 zmm {k}, m512, imm
CPUID Flags: KNCNI

Description

Permutes 128-bit blocks of the packed 32-bit integer vector a using constant imm8. The results are stored in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control) { CASE control[1:0] OF 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } tmp[511:0] := 0 FOR j := 0 to 3 i := j*128 n := j*2 tmp[i+127:i] := SELECT4(a[511:0], imm8[n+1:n]) ENDFOR FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpermf32x4
__m512i _mm512_permute4f128_epi32 (__m512i a, _MM_PERM_ENUM imm8)

Synopsis

__m512i _mm512_permute4f128_epi32 (__m512i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpermf32x4 zmm {k}, m512, imm
CPUID Flags: KNCNI

Description

Permutes 128-bit blocks of the packed 32-bit integer vector a using constant imm8. The results are stored in dst.

Operation

SELECT4(src, control) { CASE control[1:0] OF 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } FOR j := 0 to 3 i := j*128 n := j*2 dst[i+127:i] := SELECT4(a[511:0], imm8[n+1:n]) ENDFOR dst[MAX:512] := 0
vpermf32x4
__m512 _mm512_mask_permute4f128_ps (__m512 src, __mmask16 k, __m512 a, _MM_PERM_ENUM imm8)

Synopsis

__m512 _mm512_mask_permute4f128_ps (__m512 src, __mmask16 k, __m512 a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpermf32x4 zmm {k}, m512, imm
CPUID Flags: KNCNI

Description

Permutes 128-bit blocks of the packed single-precision (32-bit) floating-point elements in a using constant imm8. The results are stored in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control) { CASE control[1:0] OF 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } tmp[511:0] := 0 FOR j := 0 to 3 i := j*128 n := j*2 tmp[i+127:i] := SELECT4(a[511:0], imm8[n+1:n]) ENDFOR FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpermf32x4
__m512 _mm512_permute4f128_ps (__m512 a, _MM_PERM_ENUM imm8)

Synopsis

__m512 _mm512_permute4f128_ps (__m512 a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpermf32x4 zmm {k}, m512, imm
CPUID Flags: KNCNI

Description

Permutes 128-bit blocks of the packed single-precision (32-bit) floating-point elements in a using constant imm8. The results are stored in dst.

Operation

SELECT4(src, control) { CASE control[1:0] OF 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } FOR j := 0 to 3 i := j*128 n := j*2 dst[i+127:i] := SELECT4(a[511:0], imm8[n+1:n]) ENDFOR dst[MAX:512] := 0
vpermq
__m256i _mm256_permute4x64_epi64 (__m256i a, const int imm8)

Synopsis

__m256i _mm256_permute4x64_epi64 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpermq ymm, ymm, imm
CPUID Flags: AVX2

Description

Shuffle 64-bit integers in a across lanes using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } dst[63:0] := SELECT4(a[255:0], imm8[1:0]) dst[127:64] := SELECT4(a[255:0], imm8[3:2]) dst[191:128] := SELECT4(a[255:0], imm8[5:4]) dst[255:192] := SELECT4(a[255:0], imm8[7:6]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpermpd
__m256d _mm256_permute4x64_pd (__m256d a, const int imm8)

Synopsis

__m256d _mm256_permute4x64_pd (__m256d a, const int imm8)
#include "immintrin.h"
Instruction: vpermpd ymm, ymm, imm
CPUID Flags: AVX2

Description

Shuffle double-precision (64-bit) floating-point elements in a across lanes using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } dst[63:0] := SELECT4(a[255:0], imm8[1:0]) dst[127:64] := SELECT4(a[255:0], imm8[3:2]) dst[191:128] := SELECT4(a[255:0], imm8[5:4]) dst[255:192] := SELECT4(a[255:0], imm8[7:6]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
vpermd
__m512i _mm512_mask_permutevar_epi32 (__m512i src, __mmask16 k, __m512i idx, __m512i a)

Synopsis

__m512i _mm512_mask_permutevar_epi32 (__m512i src, __mmask16 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_mask_permutexvar_epi32, and it is recommended that you use that intrinsic name.

Operation

FOR j := 0 to 15 i := j*32 id := idx[i+3:i]*32 IF k[j] dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpermd
__m512i _mm512_permutevar_epi32 (__m512i idx, __m512i a)

Synopsis

__m512i _mm512_permutevar_epi32 (__m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_permutexvar_epi32, and it is recommended that you use that intrinsic name.

Operation

FOR j := 0 to 15 i := j*32 id := idx[i+3:i]*32 dst[i+31:i] := a[id+31:id] ENDFOR dst[MAX:512] := 0
vpermilpd
__m128d _mm_mask_permutevar_pd (__m128d src, __mmask8 k, __m128d a, __m128i b)

Synopsis

__m128d _mm_mask_permutevar_pd (__m128d src, __mmask8 k, __m128d a, __m128i b)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

IF (b[1] == 0) tmp_dst[63:0] := a[63:0] IF (b[1] == 1) tmp_dst[63:0] := a[127:64] IF (b[65] == 0) tmp_dst[127:64] := a[63:0] IF (b[65] == 1) tmp_dst[127:64] := a[127:64] FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpermilpd
__m128d _mm_maskz_permutevar_pd (__mmask8 k, __m128d a, __m128i b)

Synopsis

__m128d _mm_maskz_permutevar_pd (__mmask8 k, __m128d a, __m128i b)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

IF (b[1] == 0) tmp_dst[63:0] := a[63:0] IF (b[1] == 1) tmp_dst[63:0] := a[127:64] IF (b[65] == 0) tmp_dst[127:64] := a[63:0] IF (b[65] == 1) tmp_dst[127:64] := a[127:64] FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpermilpd
__m128d _mm_permutevar_pd (__m128d a, __m128i b)

Synopsis

__m128d _mm_permutevar_pd (__m128d a, __m128i b)
#include "immintrin.h"
Instruction: vpermilpd xmm, xmm, xmm
CPUID Flags: AVX

Description

Shuffle double-precision (64-bit) floating-point elements in a using the control in b, and store the results in dst.

Operation

IF (b[1] == 0) dst[63:0] := a[63:0] IF (b[1] == 1) dst[63:0] := a[127:64] IF (b[65] == 0) dst[127:64] := a[63:0] IF (b[65] == 1) dst[127:64] := a[127:64] dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge1-
vpermilpd
__m256d _mm256_mask_permutevar_pd (__m256d src, __mmask8 k, __m256d a, __m256i b)

Synopsis

__m256d _mm256_mask_permutevar_pd (__m256d src, __mmask8 k, __m256d a, __m256i b)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

IF (b[1] == 0) tmp_dst[63:0] := a[63:0] IF (b[1] == 1) tmp_dst[63:0] := a[127:64] IF (b[65] == 0) tmp_dst[127:64] := a[63:0] IF (b[65] == 1) tmp_dst[127:64] := a[127:64] IF (b[129] == 0) tmp_dst[191:128] := a[191:128] IF (b[129] == 1) tmp_dst[191:128] := a[255:192] IF (b[193] == 0) tmp_dst[255:192] := a[191:128] IF (b[193] == 1) tmp_dst[255:192] := a[255:192] FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpermilpd
__m256d _mm256_maskz_permutevar_pd (__mmask8 k, __m256d a, __m256i b)

Synopsis

__m256d _mm256_maskz_permutevar_pd (__mmask8 k, __m256d a, __m256i b)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

IF (b[1] == 0) tmp_dst[63:0] := a[63:0] IF (b[1] == 1) tmp_dst[63:0] := a[127:64] IF (b[65] == 0) tmp_dst[127:64] := a[63:0] IF (b[65] == 1) tmp_dst[127:64] := a[127:64] IF (b[129] == 0) tmp_dst[191:128] := a[191:128] IF (b[129] == 1) tmp_dst[191:128] := a[255:192] IF (b[193] == 0) tmp_dst[255:192] := a[191:128] IF (b[193] == 1) tmp_dst[255:192] := a[255:192] FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermilpd
__m256d _mm256_permutevar_pd (__m256d a, __m256i b)

Synopsis

__m256d _mm256_permutevar_pd (__m256d a, __m256i b)
#include "immintrin.h"
Instruction: vpermilpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst.

Operation

IF (b[1] == 0) dst[63:0] := a[63:0] IF (b[1] == 1) dst[63:0] := a[127:64] IF (b[65] == 0) dst[127:64] := a[63:0] IF (b[65] == 1) dst[127:64] := a[127:64] IF (b[129] == 0) dst[191:128] := a[191:128] IF (b[129] == 1) dst[191:128] := a[255:192] IF (b[193] == 0) dst[255:192] := a[191:128] IF (b[193] == 1) dst[255:192] := a[255:192] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vpermilpd
__m512d _mm512_mask_permutevar_pd (__m512d src, __mmask8 k, __m512d a, __m512i b)

Synopsis

__m512d _mm512_mask_permutevar_pd (__m512d src, __mmask8 k, __m512d a, __m512i b)
#include "immintrin.h"
Instruction: vpermilpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

IF (b[1] == 0) tmp_dst[63:0] := a[63:0] IF (b[1] == 1) tmp_dst[63:0] := a[127:64] IF (b[65] == 0) tmp_dst[127:64] := a[63:0] IF (b[65] == 1) tmp_dst[127:64] := a[127:64] IF (b[129] == 0) tmp_dst[191:128] := a[191:128] IF (b[129] == 1) tmp_dst[191:128] := a[255:192] IF (b[193] == 0) tmp_dst[255:192] := a[191:128] IF (b[193] == 1) tmp_dst[255:192] := a[255:192] IF (b[257] == 0) tmp_dst[319:256] := a[319:256] IF (b[257] == 1) tmp_dst[319:256] := a[383:320] IF (b[321] == 0) tmp_dst[383:320] := a[319:256] IF (b[321] == 1) tmp_dst[383:320] := a[383:320] IF (b[385] == 0) tmp_dst[447:384] := a[447:384] IF (b[385] == 1) tmp_dst[447:384] := a[511:448] IF (b[449] == 0) tmp_dst[511:448] := a[447:384] IF (b[449] == 1) tmp_dst[511:448] := a[511:448] FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpermilpd
__m512d _mm512_maskz_permutevar_pd (__mmask8 k, __m512d a, __m512i b)

Synopsis

__m512d _mm512_maskz_permutevar_pd (__mmask8 k, __m512d a, __m512i b)
#include "immintrin.h"
Instruction: vpermilpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

IF (b[1] == 0) tmp_dst[63:0] := a[63:0] IF (b[1] == 1) tmp_dst[63:0] := a[127:64] IF (b[65] == 0) tmp_dst[127:64] := a[63:0] IF (b[65] == 1) tmp_dst[127:64] := a[127:64] IF (b[129] == 0) tmp_dst[191:128] := a[191:128] IF (b[129] == 1) tmp_dst[191:128] := a[255:192] IF (b[193] == 0) tmp_dst[255:192] := a[191:128] IF (b[193] == 1) tmp_dst[255:192] := a[255:192] IF (b[257] == 0) tmp_dst[319:256] := a[319:256] IF (b[257] == 1) tmp_dst[319:256] := a[383:320] IF (b[321] == 0) tmp_dst[383:320] := a[319:256] IF (b[321] == 1) tmp_dst[383:320] := a[383:320] IF (b[385] == 0) tmp_dst[447:384] := a[447:384] IF (b[385] == 1) tmp_dst[447:384] := a[511:448] IF (b[449] == 0) tmp_dst[511:448] := a[447:384] IF (b[449] == 1) tmp_dst[511:448] := a[511:448] FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermilpd
__m512d _mm512_permutevar_pd (__m512d a, __m512i b)

Synopsis

__m512d _mm512_permutevar_pd (__m512d a, __m512i b)
#include "immintrin.h"
Instruction: vpermilpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst.

Operation

IF (b[1] == 0) dst[63:0] := a[63:0] IF (b[1] == 1) dst[63:0] := a[127:64] IF (b[65] == 0) dst[127:64] := a[63:0] IF (b[65] == 1) dst[127:64] := a[127:64] IF (b[129] == 0) dst[191:128] := a[191:128] IF (b[129] == 1) dst[191:128] := a[255:192] IF (b[193] == 0) dst[255:192] := a[191:128] IF (b[193] == 1) dst[255:192] := a[255:192] IF (b[257] == 0) dst[319:256] := a[319:256] IF (b[257] == 1) dst[319:256] := a[383:320] IF (b[321] == 0) dst[383:320] := a[319:256] IF (b[321] == 1) dst[383:320] := a[383:320] IF (b[385] == 0) dst[447:384] := a[447:384] IF (b[385] == 1) dst[447:384] := a[511:448] IF (b[449] == 0) dst[511:448] := a[447:384] IF (b[449] == 1) dst[511:448] := a[511:448] dst[MAX:512] := 0
vpermilps
__m128 _mm_mask_permutevar_ps (__m128 src, __mmask8 k, __m128 a, __m128i b)

Synopsis

__m128 _mm_mask_permutevar_ps (__m128 src, __mmask8 k, __m128 a, __m128i b)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpermilps
__m128 _mm_maskz_permutevar_ps (__mmask8 k, __m128 a, __m128i b)

Synopsis

__m128 _mm_maskz_permutevar_ps (__mmask8 k, __m128 a, __m128i b)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpermilps
__m128 _mm_permutevar_ps (__m128 a, __m128i b)

Synopsis

__m128 _mm_permutevar_ps (__m128 a, __m128i b)
#include "immintrin.h"
Instruction: vpermilps xmm, xmm, xmm
CPUID Flags: AVX

Description

Shuffle single-precision (32-bit) floating-point elements in a using the control in b, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } dst[31:0] := SELECT4(a[127:0], b[1:0]) dst[63:32] := SELECT4(a[127:0], b[33:32]) dst[95:64] := SELECT4(a[127:0], b[65:64]) dst[127:96] := SELECT4(a[127:0], b[97:96]) dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge1-
vpermilps
__m256 _mm256_mask_permutevar_ps (__m256 src, __mmask8 k, __m256 a, __m256i b)

Synopsis

__m256 _mm256_mask_permutevar_ps (__m256 src, __mmask8 k, __m256 a, __m256i b)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpermilps
__m256 _mm256_maskz_permutevar_ps (__mmask8 k, __m256 a, __m256i b)

Synopsis

__m256 _mm256_maskz_permutevar_ps (__mmask8 k, __m256 a, __m256i b)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermilps
__m256 _mm256_permutevar_ps (__m256 a, __m256i b)

Synopsis

__m256 _mm256_permutevar_ps (__m256 a, __m256i b)
#include "immintrin.h"
Instruction: vpermilps ymm, ymm, ymm
CPUID Flags: AVX

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } dst[31:0] := SELECT4(a[127:0], b[1:0]) dst[63:32] := SELECT4(a[127:0], b[33:32]) dst[95:64] := SELECT4(a[127:0], b[65:64]) dst[127:96] := SELECT4(a[127:0], b[97:96]) dst[159:128] := SELECT4(a[255:128], b[129:128]) dst[191:160] := SELECT4(a[255:128], b[161:160]) dst[223:192] := SELECT4(a[255:128], b[193:192]) dst[255:224] := SELECT4(a[255:128], b[225:224]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vpermilps
__m512 _mm512_mask_permutevar_ps (__m512 src, __mmask16 k, __m512 a, __m512i b)

Synopsis

__m512 _mm512_mask_permutevar_ps (__m512 src, __mmask16 k, __m512 a, __m512i b)
#include "immintrin.h"
Instruction: vpermilps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) tmp_dst[287:256] := SELECT4(a[383:256], b[257:256]) tmp_dst[319:288] := SELECT4(a[383:256], b[289:288]) tmp_dst[351:320] := SELECT4(a[383:256], b[321:320]) tmp_dst[383:352] := SELECT4(a[383:256], b[353:352]) tmp_dst[415:384] := SELECT4(a[511:384], b[385:384]) tmp_dst[447:416] := SELECT4(a[511:384], b[417:416]) tmp_dst[479:448] := SELECT4(a[511:384], b[449:448]) tmp_dst[511:480] := SELECT4(a[511:384], b[481:480]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpermilps
__m512 _mm512_maskz_permutevar_ps (__mmask16 k, __m512 a, __m512i b)

Synopsis

__m512 _mm512_maskz_permutevar_ps (__mmask16 k, __m512 a, __m512i b)
#include "immintrin.h"
Instruction: vpermilps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) tmp_dst[287:256] := SELECT4(a[383:256], b[257:256]) tmp_dst[319:288] := SELECT4(a[383:256], b[289:288]) tmp_dst[351:320] := SELECT4(a[383:256], b[321:320]) tmp_dst[383:352] := SELECT4(a[383:256], b[353:352]) tmp_dst[415:384] := SELECT4(a[511:384], b[385:384]) tmp_dst[447:416] := SELECT4(a[511:384], b[417:416]) tmp_dst[479:448] := SELECT4(a[511:384], b[449:448]) tmp_dst[511:480] := SELECT4(a[511:384], b[481:480]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermilps
__m512 _mm512_permutevar_ps (__m512 a, __m512i b)

Synopsis

__m512 _mm512_permutevar_ps (__m512 a, __m512i b)
#include "immintrin.h"
Instruction: vpermilps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } dst[31:0] := SELECT4(a[127:0], b[1:0]) dst[63:32] := SELECT4(a[127:0], b[33:32]) dst[95:64] := SELECT4(a[127:0], b[65:64]) dst[127:96] := SELECT4(a[127:0], b[97:96]) dst[159:128] := SELECT4(a[255:128], b[129:128]) dst[191:160] := SELECT4(a[255:128], b[161:160]) dst[223:192] := SELECT4(a[255:128], b[193:192]) dst[255:224] := SELECT4(a[255:128], b[225:224]) dst[287:256] := SELECT4(a[383:256], b[257:256]) dst[319:288] := SELECT4(a[383:256], b[289:288]) dst[351:320] := SELECT4(a[383:256], b[321:320]) dst[383:352] := SELECT4(a[383:256], b[353:352]) dst[415:384] := SELECT4(a[511:384], b[385:384]) dst[447:416] := SELECT4(a[511:384], b[417:416]) dst[479:448] := SELECT4(a[511:384], b[449:448]) dst[511:480] := SELECT4(a[511:384], b[481:480]) dst[MAX:512] := 0
vpermd
__m256i _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx)

Synopsis

__m256i _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx)
#include "immintrin.h"
Instruction: vpermd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 id := idx[i+2:i]*32 dst[i+31:i] := a[id+31:id] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
vpermps
__m256 _mm256_permutevar8x32_ps (__m256 a, __m256i idx)

Synopsis

__m256 _mm256_permutevar8x32_ps (__m256 a, __m256i idx)
#include "immintrin.h"
Instruction: vpermps ymm, ymm, ymm
CPUID Flags: AVX2

Description

Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 id := idx[i+2:i]*32 dst[i+31:i] := a[id+31:id] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
vpermq
__m256i _mm256_mask_permutex_epi64 (__m256i src, __mmask8 k, __m256i a, const int imm8)

Synopsis

__m256i _mm256_mask_permutex_epi64 (__m256i src, __mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpermq
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 64-bit integers in a across lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpermq
__m256i _mm256_maskz_permutex_epi64 (__mmask8 k, __m256i a, const int imm8)

Synopsis

__m256i _mm256_maskz_permutex_epi64 (__mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpermq
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 64-bit integers in a across lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermq
__m256i _mm256_permutex_epi64 (__m256i a, const int imm8)

Synopsis

__m256i _mm256_permutex_epi64 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpermq
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 64-bit integers in a across lanes using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } dst[63:0] := SELECT4(a[255:0], imm8[1:0]) dst[127:64] := SELECT4(a[255:0], imm8[3:2]) dst[191:128] := SELECT4(a[255:0], imm8[5:4]) dst[255:192] := SELECT4(a[255:0], imm8[7:6]) dst[MAX:256] := 0
vpermq
__m512i _mm512_mask_permutex_epi64 (__m512i src, __mmask8 k, __m512i a, const int imm8)

Synopsis

__m512i _mm512_mask_permutex_epi64 (__m512i src, __mmask8 k, __m512i a, const int imm8)
#include "immintrin.h"
Instruction: vpermq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpermq
__m512i _mm512_maskz_permutex_epi64 (__mmask8 k, __m512i a, const int imm8)

Synopsis

__m512i _mm512_maskz_permutex_epi64 (__mmask8 k, __m512i a, const int imm8)
#include "immintrin.h"
Instruction: vpermq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermq
__m512i _mm512_permutex_epi64 (__m512i a, const int imm8)

Synopsis

__m512i _mm512_permutex_epi64 (__m512i a, const int imm8)
#include "immintrin.h"
Instruction: vpermq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } dst[63:0] := SELECT4(a[255:0], imm8[1:0]) dst[127:64] := SELECT4(a[255:0], imm8[3:2]) dst[191:128] := SELECT4(a[255:0], imm8[5:4]) dst[255:192] := SELECT4(a[255:0], imm8[7:6]) dst[319:256] := SELECT4(a[511:256], imm8[1:0]) dst[383:320] := SELECT4(a[511:256], imm8[3:2]) dst[447:384] := SELECT4(a[511:256], imm8[5:4]) dst[511:448] := SELECT4(a[511:256], imm8[7:6]) dst[MAX:512] := 0
vpermpd
__m256d _mm256_mask_permutex_pd (__m256d src, __mmask8 k, __m256d a, int imm8)

Synopsis

__m256d _mm256_mask_permutex_pd (__m256d src, __mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vpermpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a across lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpermpd
__m256d _mm256_maskz_permutex_pd (__mmask8 k, __m256d a, int imm8)

Synopsis

__m256d _mm256_maskz_permutex_pd (__mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vpermpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a across lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermpd
__m256d _mm256_permutex_pd (__m256d a, int imm8)

Synopsis

__m256d _mm256_permutex_pd (__m256d a, int imm8)
#include "immintrin.h"
Instruction: vpermpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a across lanes using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } dst[63:0] := SELECT4(a[255:0], imm8[1:0]) dst[127:64] := SELECT4(a[255:0], imm8[3:2]) dst[191:128] := SELECT4(a[255:0], imm8[5:4]) dst[255:192] := SELECT4(a[255:0], imm8[7:6]) dst[MAX:256] := 0
vpermpd
__m512d _mm512_mask_permutex_pd (__m512d src, __mmask8 k, __m512d a, const int imm8)

Synopsis

__m512d _mm512_mask_permutex_pd (__m512d src, __mmask8 k, __m512d a, const int imm8)
#include "immintrin.h"
Instruction: vpermpd zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpermpd
__m512d _mm512_maskz_permutex_pd (__mmask8 k, __m512d a, const int imm8)

Synopsis

__m512d _mm512_maskz_permutex_pd (__mmask8 k, __m512d a, const int imm8)
#include "immintrin.h"
Instruction: vpermpd zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermpd
__m512d _mm512_permutex_pd (__m512d a, const int imm8)

Synopsis

__m512d _mm512_permutex_pd (__m512d a, const int imm8)
#include "immintrin.h"
Instruction: vpermpd zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } dst[63:0] := SELECT4(a[255:0], imm8[1:0]) dst[127:64] := SELECT4(a[255:0], imm8[3:2]) dst[191:128] := SELECT4(a[255:0], imm8[5:4]) dst[255:192] := SELECT4(a[255:0], imm8[7:6]) dst[319:256] := SELECT4(a[511:256], imm8[1:0]) dst[383:320] := SELECT4(a[511:256], imm8[3:2]) dst[447:384] := SELECT4(a[511:256], imm8[5:4]) dst[511:448] := SELECT4(a[511:256], imm8[7:6]) dst[MAX:512] := 0
vpermt2w
__m128i _mm_mask_permutex2var_epi16 (__m128i a, __mmask8 k, __m128i idx, __m128i b)

Synopsis

__m128i _mm_mask_permutex2var_epi16 (__m128i a, __mmask8 k, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermt2w
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] off := 16*idx[i+2:i] dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpermi2w
__m128i _mm_mask2_permutex2var_epi16 (__m128i a, __m128i idx, __mmask8 k, __m128i b)

Synopsis

__m128i _mm_mask2_permutex2var_epi16 (__m128i a, __m128i idx, __mmask8 k, __m128i b)
#include "immintrin.h"
Instruction: vpermi2w
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] off := 16*idx[i+2:i] dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := idx[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpermi2w, vpermt2w
__m128i _mm_maskz_permutex2var_epi16 (__mmask8 k, __m128i a, __m128i idx, __m128i b)

Synopsis

__m128i _mm_maskz_permutex2var_epi16 (__mmask8 k, __m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2w
             vpermt2w
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] off := 16*idx[i+2:i] dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpermi2w, vpermt2w
__m128i _mm_permutex2var_epi16 (__m128i a, __m128i idx, __m128i b)

Synopsis

__m128i _mm_permutex2var_epi16 (__m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2w
             vpermt2w
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 off := 16*idx[i+2:i] dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] ENDFOR dst[MAX:128] := 0
vpermt2w
__m256i _mm256_mask_permutex2var_epi16 (__m256i a, __mmask16 k, __m256i idx, __m256i b)

Synopsis

__m256i _mm256_mask_permutex2var_epi16 (__m256i a, __mmask16 k, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermt2w
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] off := 16*idx[i+3:i] dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpermi2w
__m256i _mm256_mask2_permutex2var_epi16 (__m256i a, __m256i idx, __mmask16 k, __m256i b)

Synopsis

__m256i _mm256_mask2_permutex2var_epi16 (__m256i a, __m256i idx, __mmask16 k, __m256i b)
#include "immintrin.h"
Instruction: vpermi2w
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] off := 16*idx[i+3:i] dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := idx[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpermi2w, vpermt2w
__m256i _mm256_maskz_permutex2var_epi16 (__mmask16 k, __m256i a, __m256i idx, __m256i b)

Synopsis

__m256i _mm256_maskz_permutex2var_epi16 (__mmask16 k, __m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2w
             vpermt2w
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] off := 16*idx[i+3:i] dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermi2w, vpermt2w
__m256i _mm256_permutex2var_epi16 (__m256i a, __m256i idx, __m256i b)

Synopsis

__m256i _mm256_permutex2var_epi16 (__m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2w
             vpermt2w
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 off := 16*idx[i+3:i] dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] ENDFOR dst[MAX:256] := 0
vpermt2w
__m512i _mm512_mask_permutex2var_epi16 (__m512i a, __mmask32 k, __m512i idx, __m512i b)

Synopsis

__m512i _mm512_mask_permutex2var_epi16 (__m512i a, __mmask32 k, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermt2w
CPUID Flags: AVX512BW

Description

Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] off := 16*idx[i+4:i] dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpermi2w
__m512i _mm512_mask2_permutex2var_epi16 (__m512i a, __m512i idx, __mmask32 k, __m512i b)

Synopsis

__m512i _mm512_mask2_permutex2var_epi16 (__m512i a, __m512i idx, __mmask32 k, __m512i b)
#include "immintrin.h"
Instruction: vpermi2w
CPUID Flags: AVX512BW

Description

Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] off := 16*idx[i+4:i] dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := idx[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpermi2w, vpermt2w
__m512i _mm512_maskz_permutex2var_epi16 (__mmask32 k, __m512i a, __m512i idx, __m512i b)

Synopsis

__m512i _mm512_maskz_permutex2var_epi16 (__mmask32 k, __m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2w
             vpermt2w
CPUID Flags: AVX512BW

Description

Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] off := 16*idx[i+4:i] dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermi2w, vpermt2w
__m512i _mm512_permutex2var_epi16 (__m512i a, __m512i idx, __m512i b)

Synopsis

__m512i _mm512_permutex2var_epi16 (__m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2w
             vpermt2w
CPUID Flags: AVX512BW

Description

Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 off := 16*idx[i+4:i] dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] ENDFOR dst[MAX:512] := 0
vpermt2d
__m128i _mm_mask_permutex2var_epi32 (__m128i a, __mmask8 k, __m128i idx, __m128i b)

Synopsis

__m128i _mm_mask_permutex2var_epi32 (__m128i a, __mmask8 k, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermt2d
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 32-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 off := idx[i+1:i]*32 IF k[j] dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpermi2d
__m128i _mm_mask2_permutex2var_epi32 (__m128i a, __m128i idx, __mmask8 k, __m128i b)

Synopsis

__m128i _mm_mask2_permutex2var_epi32 (__m128i a, __m128i idx, __mmask8 k, __m128i b)
#include "immintrin.h"
Instruction: vpermi2d
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 32-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 off := idx[i+1:i]*32 IF k[j] dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := idx[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpermi2d, vpermt2d
__m128i _mm_maskz_permutex2var_epi32 (__mmask8 k, __m128i a, __m128i idx, __m128i b)

Synopsis

__m128i _mm_maskz_permutex2var_epi32 (__mmask8 k, __m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2d
             vpermt2d
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 32-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 off := idx[i+1:i]*32 IF k[j] dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpermi2d, vpermt2d
__m128i _mm_permutex2var_epi32 (__m128i a, __m128i idx, __m128i b)

Synopsis

__m128i _mm_permutex2var_epi32 (__m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2d
             vpermt2d
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 32-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 off := idx[i+1:i]*32 dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] ENDFOR dst[MAX:128] := 0
vpermt2d
__m256i _mm256_mask_permutex2var_epi32 (__m256i a, __mmask8 k, __m256i idx, __m256i b)

Synopsis

__m256i _mm256_mask_permutex2var_epi32 (__m256i a, __mmask8 k, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermt2d
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 IF k[j] dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpermi2d
__m256i _mm256_mask2_permutex2var_epi32 (__m256i a, __m256i idx, __mmask8 k, __m256i b)

Synopsis

__m256i _mm256_mask2_permutex2var_epi32 (__m256i a, __m256i idx, __mmask8 k, __m256i b)
#include "immintrin.h"
Instruction: vpermi2d
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 IF k[j] dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := idx[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpermi2d, vpermt2d
__m256i _mm256_maskz_permutex2var_epi32 (__mmask8 k, __m256i a, __m256i idx, __m256i b)

Synopsis

__m256i _mm256_maskz_permutex2var_epi32 (__mmask8 k, __m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2d
             vpermt2d
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 IF k[j] dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermi2d, vpermt2d
__m256i _mm256_permutex2var_epi32 (__m256i a, __m256i idx, __m256i b)

Synopsis

__m256i _mm256_permutex2var_epi32 (__m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2d
             vpermt2d
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] ENDFOR dst[MAX:256] := 0
vpermt2d
__m512i _mm512_mask_permutex2var_epi32 (__m512i a, __mmask16 k, __m512i idx, __m512i b)

Synopsis

__m512i _mm512_mask_permutex2var_epi32 (__m512i a, __mmask16 k, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermt2d zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 off := idx[i+3:i]*32 IF k[j] dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpermi2d
__m512i _mm512_mask2_permutex2var_epi32 (__m512i a, __m512i idx, __mmask16 k, __m512i b)

Synopsis

__m512i _mm512_mask2_permutex2var_epi32 (__m512i a, __m512i idx, __mmask16 k, __m512i b)
#include "immintrin.h"
Instruction: vpermi2d zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 off := idx[i+3:i]*32 IF k[j] dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := idx[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpermi2d, vpermt2d
__m512i _mm512_maskz_permutex2var_epi32 (__mmask16 k, __m512i a, __m512i idx, __m512i b)

Synopsis

__m512i _mm512_maskz_permutex2var_epi32 (__mmask16 k, __m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2d zmm {k}, zmm, zmm
             vpermt2d zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 off := idx[i+3:i]*32 IF k[j] dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermi2d, vpermt2d
__m512i _mm512_permutex2var_epi32 (__m512i a, __m512i idx, __m512i b)

Synopsis

__m512i _mm512_permutex2var_epi32 (__m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2d zmm {k}, zmm, zmm
             vpermt2d zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 off := idx[i+3:i]*32 dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ENDFOR dst[MAX:512] := 0
vpermt2q
__m128i _mm_mask_permutex2var_epi64 (__m128i a, __mmask8 k, __m128i idx, __m128i b)

Synopsis

__m128i _mm_mask_permutex2var_epi64 (__m128i a, __mmask8 k, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermt2q
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 64-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 off := idx[i]*64 IF k[j] dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpermi2q
__m128i _mm_mask2_permutex2var_epi64 (__m128i a, __m128i idx, __mmask8 k, __m128i b)

Synopsis

__m128i _mm_mask2_permutex2var_epi64 (__m128i a, __m128i idx, __mmask8 k, __m128i b)
#include "immintrin.h"
Instruction: vpermi2q
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 64-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 off := idx[i]*64 IF k[j] dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := idx[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpermi2q, vpermt2q
__m128i _mm_maskz_permutex2var_epi64 (__mmask8 k, __m128i a, __m128i idx, __m128i b)

Synopsis

__m128i _mm_maskz_permutex2var_epi64 (__mmask8 k, __m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2q
             vpermt2q
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 64-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 off := idx[i]*64 IF k[j] dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpermi2q, vpermt2q
__m128i _mm_permutex2var_epi64 (__m128i a, __m128i idx, __m128i b)

Synopsis

__m128i _mm_permutex2var_epi64 (__m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2q
             vpermt2q
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 64-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 off := idx[i]*64 dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] ENDFOR dst[MAX:128] := 0
vpermt2q
__m256i _mm256_mask_permutex2var_epi64 (__m256i a, __mmask8 k, __m256i idx, __m256i b)

Synopsis

__m256i _mm256_mask_permutex2var_epi64 (__m256i a, __mmask8 k, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermt2q
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 IF k[j] dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpermi2q
__m256i _mm256_mask2_permutex2var_epi64 (__m256i a, __m256i idx, __mmask8 k, __m256i b)

Synopsis

__m256i _mm256_mask2_permutex2var_epi64 (__m256i a, __m256i idx, __mmask8 k, __m256i b)
#include "immintrin.h"
Instruction: vpermi2q
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 IF k[j] dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := idx[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpermi2q, vpermt2q
__m256i _mm256_maskz_permutex2var_epi64 (__mmask8 k, __m256i a, __m256i idx, __m256i b)

Synopsis

__m256i _mm256_maskz_permutex2var_epi64 (__mmask8 k, __m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2q
             vpermt2q
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 IF k[j] dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermi2q, vpermt2q
__m256i _mm256_permutex2var_epi64 (__m256i a, __m256i idx, __m256i b)

Synopsis

__m256i _mm256_permutex2var_epi64 (__m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2q
             vpermt2q
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] ENDFOR dst[MAX:256] := 0
vpermt2q
__m512i _mm512_mask_permutex2var_epi64 (__m512i a, __mmask8 k, __m512i idx, __m512i b)

Synopsis

__m512i _mm512_mask_permutex2var_epi64 (__m512i a, __mmask8 k, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermt2q zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 off := idx[i+2:i]*64 IF k[j] dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpermi2q
__m512i _mm512_mask2_permutex2var_epi64 (__m512i a, __m512i idx, __mmask8 k, __m512i b)

Synopsis

__m512i _mm512_mask2_permutex2var_epi64 (__m512i a, __m512i idx, __mmask8 k, __m512i b)
#include "immintrin.h"
Instruction: vpermi2q zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 off := idx[i+2:i]*64 IF k[j] dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := idx[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpermi2q, vpermt2q
__m512i _mm512_maskz_permutex2var_epi64 (__mmask8 k, __m512i a, __m512i idx, __m512i b)

Synopsis

__m512i _mm512_maskz_permutex2var_epi64 (__mmask8 k, __m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2q zmm {k}, zmm, zmm
             vpermt2q zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 off := idx[i+2:i]*64 IF k[j] dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermi2q, vpermt2q
__m512i _mm512_permutex2var_epi64 (__m512i a, __m512i idx, __m512i b)

Synopsis

__m512i _mm512_permutex2var_epi64 (__m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2q zmm {k}, zmm, zmm
             vpermt2q zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 off := idx[i+2:i]*64 dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ENDFOR dst[MAX:512] := 0
vpermt2b
__m128i _mm_mask_permutex2var_epi8 (__m128i a, __mmask16 k, __m128i idx, __m128i b)

Synopsis

__m128i _mm_mask_permutex2var_epi8 (__m128i a, __mmask16 k, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermt2b
CPUID Flags: AVX512VBMI + AVX512VL

Description

Shuffle 8-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] off := 8*idx[i+3:i] dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] ELSE dst[i+7:i] := a[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpermi2b
__m128i _mm_mask2_permutex2var_epi8 (__m128i a, __m128i idx, __mmask16 k, __m128i b)

Synopsis

__m128i _mm_mask2_permutex2var_epi8 (__m128i a, __m128i idx, __mmask16 k, __m128i b)
#include "immintrin.h"
Instruction: vpermi2b
CPUID Flags: AVX512VBMI + AVX512VL

Description

Shuffle 8-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] off := 8*idx[i+3:i] dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] ELSE dst[i+7:i] := idx[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpermi2b, vpermt2b
__m128i _mm_maskz_permutex2var_epi8 (__mmask16 k, __m128i a, __m128i idx, __m128i b)

Synopsis

__m128i _mm_maskz_permutex2var_epi8 (__mmask16 k, __m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2b
             vpermt2b
CPUID Flags: AVX512VBMI + AVX512VL

Description

Shuffle 8-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] off := 8*idx[i+3:i] dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpermi2b
__m128i _mm_permutex2var_epi8 (__m128i a, __m128i idx, __m128i b)

Synopsis

__m128i _mm_permutex2var_epi8 (__m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2b
CPUID Flags: AVX512VBMI + AVX512VL

Description

Shuffle 8-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 off := 8*idx[i+3:i] dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] ENDFOR dst[MAX:128] := 0
vpermt2b
__m256i _mm256_mask_permutex2var_epi8 (__m256i a, __mmask32 k, __m256i idx, __m256i b)

Synopsis

__m256i _mm256_mask_permutex2var_epi8 (__m256i a, __mmask32 k, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermt2b
CPUID Flags: AVX512VBMI + AVX512VL

Description

Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] off := 8*idx[i+4:i] dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] ELSE dst[i+7:i] := a[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpermi2b
__m256i _mm256_mask2_permutex2var_epi8 (__m256i a, __m256i idx, __mmask32 k, __m256i b)

Synopsis

__m256i _mm256_mask2_permutex2var_epi8 (__m256i a, __m256i idx, __mmask32 k, __m256i b)
#include "immintrin.h"
Instruction: vpermi2b
CPUID Flags: AVX512VBMI + AVX512VL

Description

Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] off := 8*idx[i+4:i] dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] ELSE dst[i+7:i] := idx[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpermi2b, vpermt2b
__m256i _mm256_maskz_permutex2var_epi8 (__mmask32 k, __m256i a, __m256i idx, __m256i b)

Synopsis

__m256i _mm256_maskz_permutex2var_epi8 (__mmask32 k, __m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2b
             vpermt2b
CPUID Flags: AVX512VBMI + AVX512VL

Description

Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] off := 8*idx[i+4:i] dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermi2b
__m256i _mm256_permutex2var_epi8 (__m256i a, __m256i idx, __m256i b)

Synopsis

__m256i _mm256_permutex2var_epi8 (__m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2b
CPUID Flags: AVX512VBMI + AVX512VL

Description

Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 off := 8*idx[i+4:i] dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] ENDFOR dst[MAX:256] := 0
vpermt2b
__m512i _mm512_mask_permutex2var_epi8 (__m512i a, __mmask64 k, __m512i idx, __m512i b)

Synopsis

__m512i _mm512_mask_permutex2var_epi8 (__m512i a, __mmask64 k, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermt2b
CPUID Flags: AVX512VBMI

Description

Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] off := 8*idx[i+5:i] dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] ELSE dst[i+7:i] := a[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpermi2b
__m512i _mm512_mask2_permutex2var_epi8 (__m512i a, __m512i idx, __mmask64 k, __m512i b)

Synopsis

__m512i _mm512_mask2_permutex2var_epi8 (__m512i a, __m512i idx, __mmask64 k, __m512i b)
#include "immintrin.h"
Instruction: vpermi2b
CPUID Flags: AVX512VBMI

Description

Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] off := 8*idx[i+5:i] dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] ELSE dst[i+7:i] := idx[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpermi2b, vpermt2b
__m512i _mm512_maskz_permutex2var_epi8 (__mmask64 k, __m512i a, __m512i idx, __m512i b)

Synopsis

__m512i _mm512_maskz_permutex2var_epi8 (__mmask64 k, __m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2b
             vpermt2b
CPUID Flags: AVX512VBMI

Description

Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] off := 8*idx[i+5:i] dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermi2b
__m512i _mm512_permutex2var_epi8 (__m512i a, __m512i idx, __m512i b)

Synopsis

__m512i _mm512_permutex2var_epi8 (__m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2b
CPUID Flags: AVX512VBMI

Description

Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 63 i := j*8 off := 8*idx[i+5:i] dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] ENDFOR dst[MAX:512] := 0
vpermt2pd
__m128d _mm_mask_permutex2var_pd (__m128d a, __mmask8 k, __m128i idx, __m128d b)

Synopsis

__m128d _mm_mask_permutex2var_pd (__m128d a, __mmask8 k, __m128i idx, __m128d b)
#include "immintrin.h"
Instruction: vpermt2pd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 off := idx[i]*64 IF k[j] dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpermi2pd
__m128d _mm_mask2_permutex2var_pd (__m128d a, __m128i idx, __mmask8 k, __m128d b)

Synopsis

__m128d _mm_mask2_permutex2var_pd (__m128d a, __m128i idx, __mmask8 k, __m128d b)
#include "immintrin.h"
Instruction: vpermi2pd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 off := idx[i]*64 IF k[j] dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := idx[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpermi2pd, vpermt2pd
__m128d _mm_maskz_permutex2var_pd (__mmask8 k, __m128d a, __m128i idx, __m128d b)

Synopsis

__m128d _mm_maskz_permutex2var_pd (__mmask8 k, __m128d a, __m128i idx, __m128d b)
#include "immintrin.h"
Instruction: vpermi2pd
             vpermt2pd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 off := idx[i]*64 IF k[j] dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpermi2pd, vpermt2pd
__m128d _mm_permutex2var_pd (__m128d a, __m128i idx, __m128d b)

Synopsis

__m128d _mm_permutex2var_pd (__m128d a, __m128i idx, __m128d b)
#include "immintrin.h"
Instruction: vpermi2pd
             vpermt2pd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 off := idx[i]*64 dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] ENDFOR dst[MAX:128] := 0
vpermt2pd
__m256d _mm256_mask_permutex2var_pd (__m256d a, __mmask8 k, __m256i idx, __m256d b)

Synopsis

__m256d _mm256_mask_permutex2var_pd (__m256d a, __mmask8 k, __m256i idx, __m256d b)
#include "immintrin.h"
Instruction: vpermt2pd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 IF k[j] dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpermi2pd
__m256d _mm256_mask2_permutex2var_pd (__m256d a, __m256i idx, __mmask8 k, __m256d b)

Synopsis

__m256d _mm256_mask2_permutex2var_pd (__m256d a, __m256i idx, __mmask8 k, __m256d b)
#include "immintrin.h"
Instruction: vpermi2pd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 IF k[j] dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := idx[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpermi2pd, vpermt2pd
__m256d _mm256_maskz_permutex2var_pd (__mmask8 k, __m256d a, __m256i idx, __m256d b)

Synopsis

__m256d _mm256_maskz_permutex2var_pd (__mmask8 k, __m256d a, __m256i idx, __m256d b)
#include "immintrin.h"
Instruction: vpermi2pd
             vpermt2pd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 IF k[j] dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermi2pd, vpermt2pd
__m256d _mm256_permutex2var_pd (__m256d a, __m256i idx, __m256d b)

Synopsis

__m256d _mm256_permutex2var_pd (__m256d a, __m256i idx, __m256d b)
#include "immintrin.h"
Instruction: vpermi2pd
             vpermt2pd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] ENDFOR dst[MAX:256] := 0
vpermt2pd
__m512d _mm512_mask_permutex2var_pd (__m512d a, __mmask8 k, __m512i idx, __m512d b)

Synopsis

__m512d _mm512_mask_permutex2var_pd (__m512d a, __mmask8 k, __m512i idx, __m512d b)
#include "immintrin.h"
Instruction: vpermt2pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 off := idx[i+2:i]*64 IF k[j] dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpermi2pd
__m512d _mm512_mask2_permutex2var_pd (__m512d a, __m512i idx, __mmask8 k, __m512d b)

Synopsis

__m512d _mm512_mask2_permutex2var_pd (__m512d a, __m512i idx, __mmask8 k, __m512d b)
#include "immintrin.h"
Instruction: vpermi2pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 off := idx[i+2:i]*64 IF k[j] dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := idx[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpermi2pd, vpermt2pd
__m512d _mm512_maskz_permutex2var_pd (__mmask8 k, __m512d a, __m512i idx, __m512d b)

Synopsis

__m512d _mm512_maskz_permutex2var_pd (__mmask8 k, __m512d a, __m512i idx, __m512d b)
#include "immintrin.h"
Instruction: vpermi2pd zmm {k}, zmm, zmm
             vpermt2pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 off := idx[i+2:i]*64 IF k[j] dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermi2pd, vpermt2pd
__m512d _mm512_permutex2var_pd (__m512d a, __m512i idx, __m512d b)

Synopsis

__m512d _mm512_permutex2var_pd (__m512d a, __m512i idx, __m512d b)
#include "immintrin.h"
Instruction: vpermi2pd zmm {k}, zmm, zmm
             vpermt2pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 off := idx[i+2:i]*64 dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ENDFOR dst[MAX:512] := 0
vpermt2ps
__m128 _mm_mask_permutex2var_ps (__m128 a, __mmask8 k, __m128i idx, __m128 b)

Synopsis

__m128 _mm_mask_permutex2var_ps (__m128 a, __mmask8 k, __m128i idx, __m128 b)
#include "immintrin.h"
Instruction: vpermt2ps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 off := idx[i+1:i]*32 IF k[j] dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpermi2ps
__m128 _mm_mask2_permutex2var_ps (__m128 a, __m128i idx, __mmask8 k, __m128 b)

Synopsis

__m128 _mm_mask2_permutex2var_ps (__m128 a, __m128i idx, __mmask8 k, __m128 b)
#include "immintrin.h"
Instruction: vpermi2ps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 off := idx[i+1:i]*32 IF k[j] dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := idx[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpermi2ps, vpermt2ps
__m128 _mm_maskz_permutex2var_ps (__mmask8 k, __m128 a, __m128i idx, __m128 b)

Synopsis

__m128 _mm_maskz_permutex2var_ps (__mmask8 k, __m128 a, __m128i idx, __m128 b)
#include "immintrin.h"
Instruction: vpermi2ps
             vpermt2ps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 off := idx[i+1:i]*32 IF k[j] dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpermi2ps, vpermt2ps
__m128 _mm_permutex2var_ps (__m128 a, __m128i idx, __m128 b)

Synopsis

__m128 _mm_permutex2var_ps (__m128 a, __m128i idx, __m128 b)
#include "immintrin.h"
Instruction: vpermi2ps
             vpermt2ps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 off := idx[i+1:i]*32 dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] ENDFOR dst[MAX:128] := 0
vpermt2ps
__m256 _mm256_mask_permutex2var_ps (__m256 a, __mmask8 k, __m256i idx, __m256 b)

Synopsis

__m256 _mm256_mask_permutex2var_ps (__m256 a, __mmask8 k, __m256i idx, __m256 b)
#include "immintrin.h"
Instruction: vpermt2ps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 IF k[j] dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpermi2ps
__m256 _mm256_mask2_permutex2var_ps (__m256 a, __m256i idx, __mmask8 k, __m256 b)

Synopsis

__m256 _mm256_mask2_permutex2var_ps (__m256 a, __m256i idx, __mmask8 k, __m256 b)
#include "immintrin.h"
Instruction: vpermi2ps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 IF k[j] dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := idx[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpermi2ps, vpermt2ps
__m256 _mm256_maskz_permutex2var_ps (__mmask8 k, __m256 a, __m256i idx, __m256 b)

Synopsis

__m256 _mm256_maskz_permutex2var_ps (__mmask8 k, __m256 a, __m256i idx, __m256 b)
#include "immintrin.h"
Instruction: vpermi2ps
             vpermt2ps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 IF k[j] dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermi2ps, vpermt2ps
__m256 _mm256_permutex2var_ps (__m256 a, __m256i idx, __m256 b)

Synopsis

__m256 _mm256_permutex2var_ps (__m256 a, __m256i idx, __m256 b)
#include "immintrin.h"
Instruction: vpermi2ps
             vpermt2ps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] ENDFOR dst[MAX:256] := 0
vpermt2ps
__m512 _mm512_mask_permutex2var_ps (__m512 a, __mmask16 k, __m512i idx, __m512 b)

Synopsis

__m512 _mm512_mask_permutex2var_ps (__m512 a, __mmask16 k, __m512i idx, __m512 b)
#include "immintrin.h"
Instruction: vpermt2ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 off := idx[i+3:i]*32 IF k[j] dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpermi2ps
__m512 _mm512_mask2_permutex2var_ps (__m512 a, __m512i idx, __mmask16 k, __m512 b)

Synopsis

__m512 _mm512_mask2_permutex2var_ps (__m512 a, __m512i idx, __mmask16 k, __m512 b)
#include "immintrin.h"
Instruction: vpermi2ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 off := idx[i+3:i]*32 IF k[j] dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := idx[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpermi2ps, vpermt2ps
__m512 _mm512_maskz_permutex2var_ps (__mmask16 k, __m512 a, __m512i idx, __m512 b)

Synopsis

__m512 _mm512_maskz_permutex2var_ps (__mmask16 k, __m512 a, __m512i idx, __m512 b)
#include "immintrin.h"
Instruction: vpermi2ps zmm {k}, zmm, zmm
             vpermt2ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 off := idx[i+3:i]*32 IF k[j] dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermi2ps, vpermt2ps
__m512 _mm512_permutex2var_ps (__m512 a, __m512i idx, __m512 b)

Synopsis

__m512 _mm512_permutex2var_ps (__m512 a, __m512i idx, __m512 b)
#include "immintrin.h"
Instruction: vpermi2ps zmm {k}, zmm, zmm
             vpermt2ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 off := idx[i+3:i]*32 dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ENDFOR dst[MAX:512] := 0
vpermw
__m128i _mm_mask_permutexvar_epi16 (__m128i src, __mmask8 k, __m128i idx, __m128i a)

Synopsis

__m128i _mm_mask_permutexvar_epi16 (__m128i src, __mmask8 k, __m128i idx, __m128i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in a using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 id := idx[i+2:i]*16 IF k[j] dst[i+15:i] := a[id+15:id] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpermw
__m128i _mm_maskz_permutexvar_epi16 (__mmask8 k, __m128i idx, __m128i a)

Synopsis

__m128i _mm_maskz_permutexvar_epi16 (__mmask8 k, __m128i idx, __m128i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in a using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 id := idx[i+2:i]*16 IF k[j] dst[i+15:i] := a[id+15:id] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpermw
__m128i _mm_permutexvar_epi16 (__m128i idx, __m128i a)

Synopsis

__m128i _mm_permutexvar_epi16 (__m128i idx, __m128i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in a using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 id := idx[i+2:i]*16 dst[i+15:i] := a[id+15:id] ENDFOR dst[MAX:128] := 0
vpermw
__m256i _mm256_mask_permutexvar_epi16 (__m256i src, __mmask16 k, __m256i idx, __m256i a)

Synopsis

__m256i _mm256_mask_permutexvar_epi16 (__m256i src, __mmask16 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 id := idx[i+3:i]*16 IF k[j] dst[i+15:i] := a[id+15:id] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpermw
__m256i _mm256_maskz_permutexvar_epi16 (__mmask16 k, __m256i idx, __m256i a)

Synopsis

__m256i _mm256_maskz_permutexvar_epi16 (__mmask16 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 id := idx[i+3:i]*16 IF k[j] dst[i+15:i] := a[id+15:id] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermw
__m256i _mm256_permutexvar_epi16 (__m256i idx, __m256i a)

Synopsis

__m256i _mm256_permutexvar_epi16 (__m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 id := idx[i+3:i]*16 dst[i+15:i] := a[id+15:id] ENDFOR dst[MAX:256] := 0
vpermw
__m512i _mm512_mask_permutexvar_epi16 (__m512i src, __mmask32 k, __m512i idx, __m512i a)

Synopsis

__m512i _mm512_mask_permutexvar_epi16 (__m512i src, __mmask32 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512BW

Description

Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 id := idx[i+4:i]*16 IF k[j] dst[i+15:i] := a[id+15:id] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpermw
__m512i _mm512_maskz_permutexvar_epi16 (__mmask32 k, __m512i idx, __m512i a)

Synopsis

__m512i _mm512_maskz_permutexvar_epi16 (__mmask32 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512BW

Description

Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 id := idx[i+4:i]*16 IF k[j] dst[i+15:i] := a[id+15:id] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermw
__m512i _mm512_permutexvar_epi16 (__m512i idx, __m512i a)

Synopsis

__m512i _mm512_permutexvar_epi16 (__m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512BW

Description

Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 id := idx[i+4:i]*16 dst[i+15:i] := a[id+15:id] ENDFOR dst[MAX:512] := 0
vpermd
__m256i _mm256_mask_permutexvar_epi32 (__m256i src, __mmask8 k, __m256i idx, __m256i a)

Synopsis

__m256i _mm256_mask_permutexvar_epi32 (__m256i src, __mmask8 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 id := idx[i+2:i]*32 IF k[j] dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpermd
__m256i _mm256_maskz_permutexvar_epi32 (__mmask8 k, __m256i idx, __m256i a)

Synopsis

__m256i _mm256_maskz_permutexvar_epi32 (__mmask8 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 id := idx[i+2:i]*32 IF k[j] dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermd
__m256i _mm256_permutexvar_epi32 (__m256i idx, __m256i a)

Synopsis

__m256i _mm256_permutexvar_epi32 (__m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 id := idx[i+2:i]*32 dst[i+31:i] := a[id+31:id] ENDFOR dst[MAX:256] := 0
vpermd
__m512i _mm512_mask_permutexvar_epi32 (__m512i src, __mmask16 k, __m512i idx, __m512i a)

Synopsis

__m512i _mm512_mask_permutexvar_epi32 (__m512i src, __mmask16 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 id := idx[i+3:i]*32 IF k[j] dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpermd
__m512i _mm512_maskz_permutexvar_epi32 (__mmask16 k, __m512i idx, __m512i a)

Synopsis

__m512i _mm512_maskz_permutexvar_epi32 (__mmask16 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 id := idx[i+3:i]*32 IF k[j] dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermd
__m512i _mm512_permutexvar_epi32 (__m512i idx, __m512i a)

Synopsis

__m512i _mm512_permutexvar_epi32 (__m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 id := idx[i+3:i]*32 dst[i+31:i] := a[id+31:id] ENDFOR dst[MAX:512] := 0
vpermq
__m256i _mm256_mask_permutexvar_epi64 (__m256i src, __mmask8 k, __m256i idx, __m256i a)

Synopsis

__m256i _mm256_mask_permutexvar_epi64 (__m256i src, __mmask8 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermq
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 id := idx[i+1:i]*64 IF k[j] dst[i+63:i] := a[id+63:id] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpermq
__m256i _mm256_maskz_permutexvar_epi64 (__mmask8 k, __m256i idx, __m256i a)

Synopsis

__m256i _mm256_maskz_permutexvar_epi64 (__mmask8 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermq
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 id := idx[i+1:i]*64 IF k[j] dst[i+63:i] := a[id+63:id] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermq
__m256i _mm256_permutexvar_epi64 (__m256i idx, __m256i a)

Synopsis

__m256i _mm256_permutexvar_epi64 (__m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermq
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 id := idx[i+1:i]*64 dst[i+63:i] := a[id+63:id] ENDFOR dst[MAX:256] := 0
vpermq
__m512i _mm512_mask_permutexvar_epi64 (__m512i src, __mmask8 k, __m512i idx, __m512i a)

Synopsis

__m512i _mm512_mask_permutexvar_epi64 (__m512i src, __mmask8 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 id := idx[i+2:i]*64 IF k[j] dst[i+63:i] := a[id+63:id] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpermq
__m512i _mm512_maskz_permutexvar_epi64 (__mmask8 k, __m512i idx, __m512i a)

Synopsis

__m512i _mm512_maskz_permutexvar_epi64 (__mmask8 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 id := idx[i+2:i]*64 IF k[j] dst[i+63:i] := a[id+63:id] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermq
__m512i _mm512_permutexvar_epi64 (__m512i idx, __m512i a)

Synopsis

__m512i _mm512_permutexvar_epi64 (__m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 id := idx[i+2:i]*64 dst[i+63:i] := a[id+63:id] ENDFOR dst[MAX:512] := 0
vpermb
__m128i _mm_mask_permutexvar_epi8 (__m128i src, __mmask16 k, __m128i idx, __m128i a)

Synopsis

__m128i _mm_mask_permutexvar_epi8 (__m128i src, __mmask16 k, __m128i idx, __m128i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI + AVX512VL

Description

Shuffle 8-bit integers in a using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 id := idx[i+3:i]*8 IF k[j] dst[i+7:i] := a[id+7:id] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpermb
__m128i _mm_maskz_permutexvar_epi8 (__mmask16 k, __m128i idx, __m128i a)

Synopsis

__m128i _mm_maskz_permutexvar_epi8 (__mmask16 k, __m128i idx, __m128i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI + AVX512VL

Description

Shuffle 8-bit integers in a using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 id := idx[i+3:i]*8 IF k[j] dst[i+7:i] := a[id+7:id] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpermb
__m128i _mm_permutexvar_epi8 (__m128i idx, __m128i a)

Synopsis

__m128i _mm_permutexvar_epi8 (__m128i idx, __m128i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI + AVX512VL

Description

Shuffle 8-bit integers in a using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 id := idx[i+3:i]*8 dst[i+7:i] := a[id+7:id] ENDFOR dst[MAX:128] := 0
vpermb
__m256i _mm256_mask_permutexvar_epi8 (__m256i src, __mmask32 k, __m256i idx, __m256i a)

Synopsis

__m256i _mm256_mask_permutexvar_epi8 (__m256i src, __mmask32 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI + AVX512VL

Description

Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 id := idx[i+4:i]*8 IF k[j] dst[i+7:i] := a[id+7:id] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpermb
__m256i _mm256_maskz_permutexvar_epi8 (__mmask32 k, __m256i idx, __m256i a)

Synopsis

__m256i _mm256_maskz_permutexvar_epi8 (__mmask32 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI + AVX512VL

Description

Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 id := idx[i+4:i]*8 IF k[j] dst[i+7:i] := a[id+7:id] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermb
__m256i _mm256_permutexvar_epi8 (__m256i idx, __m256i a)

Synopsis

__m256i _mm256_permutexvar_epi8 (__m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI + AVX512VL

Description

Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 id := idx[i+4:i]*8 dst[i+7:i] := a[id+7:id] ENDFOR dst[MAX:256] := 0
vpermb
__m512i _mm512_mask_permutexvar_epi8 (__m512i src, __mmask64 k, __m512i idx, __m512i a)

Synopsis

__m512i _mm512_mask_permutexvar_epi8 (__m512i src, __mmask64 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI

Description

Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 id := idx[i+5:i]*8 IF k[j] dst[i+7:i] := a[id+7:id] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpermb
__m512i _mm512_maskz_permutexvar_epi8 (__mmask64 k, __m512i idx, __m512i a)

Synopsis

__m512i _mm512_maskz_permutexvar_epi8 (__mmask64 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI

Description

Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 id := idx[i+5:i]*8 IF k[j] dst[i+7:i] := a[id+7:id] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermb
__m512i _mm512_permutexvar_epi8 (__m512i idx, __m512i a)

Synopsis

__m512i _mm512_permutexvar_epi8 (__m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI

Description

Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 63 i := j*8 id := idx[i+5:i]*8 dst[i+7:i] := a[id+7:id] ENDFOR dst[MAX:512] := 0
vpermpd
__m256d _mm256_mask_permutexvar_pd (__m256d src, __mmask8 k, __m256i idx, __m256d a)

Synopsis

__m256d _mm256_mask_permutexvar_pd (__m256d src, __mmask8 k, __m256i idx, __m256d a)
#include "immintrin.h"
Instruction: vpermpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 id := idx[i+1:i]*64 IF k[j] dst[i+63:i] := a[id+63:id] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpermpd
__m256d _mm256_maskz_permutexvar_pd (__mmask8 k, __m256i idx, __m256d a)

Synopsis

__m256d _mm256_maskz_permutexvar_pd (__mmask8 k, __m256i idx, __m256d a)
#include "immintrin.h"
Instruction: vpermpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 id := idx[i+1:i]*64 IF k[j] dst[i+63:i] := a[id+63:id] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermpd
__m256d _mm256_permutexvar_pd (__m256i idx, __m256d a)

Synopsis

__m256d _mm256_permutexvar_pd (__m256i idx, __m256d a)
#include "immintrin.h"
Instruction: vpermpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 id := idx[i+1:i]*64 dst[i+63:i] := a[id+63:id] ENDFOR dst[MAX:256] := 0
vpermpd
__m512d _mm512_mask_permutexvar_pd (__m512d src, __mmask8 k, __m512i idx, __m512d a)

Synopsis

__m512d _mm512_mask_permutexvar_pd (__m512d src, __mmask8 k, __m512i idx, __m512d a)
#include "immintrin.h"
Instruction: vpermpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 id := idx[i+2:i]*64 IF k[j] dst[i+63:i] := a[id+63:id] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpermpd
__m512d _mm512_maskz_permutexvar_pd (__mmask8 k, __m512i idx, __m512d a)

Synopsis

__m512d _mm512_maskz_permutexvar_pd (__mmask8 k, __m512i idx, __m512d a)
#include "immintrin.h"
Instruction: vpermpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 id := idx[i+2:i]*64 IF k[j] dst[i+63:i] := a[id+63:id] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermpd
__m512d _mm512_permutexvar_pd (__m512i idx, __m512d a)

Synopsis

__m512d _mm512_permutexvar_pd (__m512i idx, __m512d a)
#include "immintrin.h"
Instruction: vpermpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 id := idx[i+2:i]*64 dst[i+63:i] := a[id+63:id] ENDFOR dst[MAX:512] := 0
vpermps
__m256 _mm256_mask_permutexvar_ps (__m256 src, __mmask8 k, __m256i idx, __m256 a)

Synopsis

__m256 _mm256_mask_permutexvar_ps (__m256 src, __mmask8 k, __m256i idx, __m256 a)
#include "immintrin.h"
Instruction: vpermps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 id := idx[i+2:i]*32 IF k[j] dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpermps
__m256 _mm256_maskz_permutexvar_ps (__mmask8 k, __m256i idx, __m256 a)

Synopsis

__m256 _mm256_maskz_permutexvar_ps (__mmask8 k, __m256i idx, __m256 a)
#include "immintrin.h"
Instruction: vpermps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 id := idx[i+2:i]*32 IF k[j] dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpermps
__m256 _mm256_permutexvar_ps (__m256i idx, __m256 a)

Synopsis

__m256 _mm256_permutexvar_ps (__m256i idx, __m256 a)
#include "immintrin.h"
Instruction: vpermps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 id := idx[i+2:i]*32 dst[i+31:i] := a[id+31:id] ENDFOR dst[MAX:256] := 0
vpermps
__m512 _mm512_mask_permutexvar_ps (__m512 src, __mmask16 k, __m512i idx, __m512 a)

Synopsis

__m512 _mm512_mask_permutexvar_ps (__m512 src, __mmask16 k, __m512i idx, __m512 a)
#include "immintrin.h"
Instruction: vpermps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 id := idx[i+3:i]*32 IF k[j] dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpermps
__m512 _mm512_maskz_permutexvar_ps (__mmask16 k, __m512i idx, __m512 a)

Synopsis

__m512 _mm512_maskz_permutexvar_ps (__mmask16 k, __m512i idx, __m512 a)
#include "immintrin.h"
Instruction: vpermps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 id := idx[i+3:i]*32 IF k[j] dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpermps
__m512 _mm512_permutexvar_ps (__m512i idx, __m512 a)

Synopsis

__m512 _mm512_permutexvar_ps (__m512i idx, __m512 a)
#include "immintrin.h"
Instruction: vpermps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 id := idx[i+3:i]*32 dst[i+31:i] := a[id+31:id] ENDFOR dst[MAX:512] := 0
pext
unsigned int _pext_u32 (unsigned int a, unsigned int mask)

Synopsis

unsigned int _pext_u32 (unsigned int a, unsigned int mask)
#include "immintrin.h"
Instruction: pext r32, r32, r32
CPUID Flags: BMI2

Description

Extract bits from unsigned 32-bit integer a at the corresponding bit locations specified by mask to contiguous low bits in dst; the remaining upper bits in dst are set to zero.

Operation

tmp := a dst := 0 m := 0 k := 0 DO WHILE m < 32 IF mask[m] = 1 dst[k] := tmp[m] k := k + 1 FI m := m + 1 OD

Performance

ArchitectureLatencyThroughput
Haswell3-
pext
unsigned __int64 _pext_u64 (unsigned __int64 a, unsigned __int64 mask)

Synopsis

unsigned __int64 _pext_u64 (unsigned __int64 a, unsigned __int64 mask)
#include "immintrin.h"
Instruction: pext r64, r64, r64
CPUID Flags: BMI2

Description

Extract bits from unsigned 64-bit integer a at the corresponding bit locations specified by mask to contiguous low bits in dst; the remaining upper bits in dst are set to zero.

Operation

tmp := a dst := 0 m := 0 k := 0 DO WHILE m < 64 IF mask[m] = 1 dst[k] := tmp[m] k := k + 1 FI m := m + 1 OD

Performance

ArchitectureLatencyThroughput
Haswell3-
pextrw
int _m_pextrw (__m64 a, int imm8)

Synopsis

int _m_pextrw (__m64 a, int imm8)
#include "xmmintrin.h"
Instruction: pextrw r32, mm, imm
CPUID Flags: SSE

Description

Extract a 16-bit integer from a, selected with imm8, and store the result in the lower element of dst.

Operation

dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0] dst[31:16] := 0
pinsrw
__m64 _m_pinsrw (__m64 a, int i, int imm8)

Synopsis

__m64 _m_pinsrw (__m64 a, int i, int imm8)
#include "xmmintrin.h"
Instruction: pinsrw mm, r32, imm
CPUID Flags: SSE

Description

Copy a to dst, and insert the 16-bit integer i into dst at the location specified by imm8.

Operation

dst[63:0] := a[63:0] sel := imm8[1:0]*16 dst[sel+15:sel] := i[15:0]
pmaxsw
__m64 _m_pmaxsw (__m64 a, __m64 b)

Synopsis

__m64 _m_pmaxsw (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pmaxsw mm, mm
CPUID Flags: SSE

Description

Compare packed 16-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 3 i := j*16 IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell1-
pmaxub
__m64 _m_pmaxub (__m64 a, __m64 b)

Synopsis

__m64 _m_pmaxub (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pmaxub mm, mm
CPUID Flags: SSE

Description

Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst.

Operation

FOR j := 0 to 7 i := j*8 IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell1-
pminsw
__m64 _m_pminsw (__m64 a, __m64 b)

Synopsis

__m64 _m_pminsw (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pminsw mm, mm
CPUID Flags: SSE

Description

Compare packed 16-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 3 i := j*16 IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell1-
pminub
__m64 _m_pminub (__m64 a, __m64 b)

Synopsis

__m64 _m_pminub (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pminub mm, mm
CPUID Flags: SSE

Description

Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst.

Operation

FOR j := 0 to 7 i := j*8 IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell1-
pmovmskb
int _m_pmovmskb (__m64 a)

Synopsis

int _m_pmovmskb (__m64 a)
#include "xmmintrin.h"
Instruction: pmovmskb r32, mm
CPUID Flags: SSE

Description

Create mask from the most significant bit of each 8-bit element in a, and store the result in dst.

Operation

FOR j := 0 to 7 i := j*8 dst[j] := a[i+7] ENDFOR dst[MAX:8] := 0
pmulhuw
__m64 _m_pmulhuw (__m64 a, __m64 b)

Synopsis

__m64 _m_pmulhuw (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pmulhuw mm, mm
CPUID Flags: SSE

Description

Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.

Operation

FOR j := 0 to 3 i := j*16 tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ENDFOR
popcnt
int _mm_popcnt_u32 (unsigned int a)

Synopsis

int _mm_popcnt_u32 (unsigned int a)
#include "nmmintrin.h"
Instruction: popcnt r32, r32
CPUID Flags: POPCNT

Description

Count the number of bits set to 1 in unsigned 32-bit integer a, and return that count in dst.

Operation

dst := 0 FOR i := 0 to 31 IF a[i] dst := dst + 1 FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
popcnt
__int64 _mm_popcnt_u64 (unsigned __int64 a)

Synopsis

__int64 _mm_popcnt_u64 (unsigned __int64 a)
#include "nmmintrin.h"
Instruction: popcnt r64, r64
CPUID Flags: POPCNT

Description

Count the number of bits set to 1 in unsigned 64-bit integer a, and return that count in dst.

Operation

dst := 0 FOR i := 0 to 63 IF a[i] dst := dst + 1 FI ENDFOR

Performance

ArchitectureLatencyThroughput
Westmere31
Nehalem31
popcnt
int _popcnt32 (int a)

Synopsis

int _popcnt32 (int a)
#include "immintrin.h"
Instruction: popcnt r32, r32
CPUID Flags: POPCNT

Description

Count the number of bits set to 1 in 32-bit integer a, and return that count in dst.

Operation

dst := 0 FOR i := 0 to 31 IF a[i] dst := dst + 1 FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge3-
Sandy Bridge3-
Westmere3-
Nehalem3-
popcnt
int _popcnt64 (__int64 a)

Synopsis

int _popcnt64 (__int64 a)
#include "immintrin.h"
Instruction: popcnt r64, r64
CPUID Flags: POPCNT

Description

Count the number of bits set to 1 in 64-bit integer a, and return that count in dst.

Operation

dst := 0 FOR i := 0 to 63 IF a[i] dst := dst + 1 FI ENDFOR

Performance

ArchitectureLatencyThroughput
Westmere31
Nehalem31
...
__m128d _mm_pow_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_pow_pd (__m128d a, __m128d b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the exponential value of packed double-precision (64-bit) floating-point elements in a raised by packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := (a[i+63:i])^(b[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_pow_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_pow_pd (__m256d a, __m256d b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the exponential value of packed double-precision (64-bit) floating-point elements in a raised by packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := (a[i+63:i])^(b[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_mask_pow_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_pow_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of packed double-precision (64-bit) floating-point elements in a raised by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i])^(b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_pow_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_pow_pd (__m512d a, __m512d b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of packed double-precision (64-bit) floating-point elements in a raised by packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := (a[i+63:i])^(b[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m128 _mm_pow_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_pow_ps (__m128 a, __m128 b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the exponential value of packed single-precision (32-bit) floating-point elements in a raised by packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := (a[i+31:i])^(b[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_pow_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_pow_ps (__m256 a, __m256 b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the exponential value of packed single-precision (32-bit) floating-point elements in a raised by packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := (a[i+31:i])^(b[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_mask_pow_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_pow_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of packed single-precision (32-bit) floating-point elements in a raised by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i])^(b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_pow_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_pow_ps (__m512 a, __m512 b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the exponential value of packed single-precision (32-bit) floating-point elements in a raised by packed elements in b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := (a[i+31:i])^(b[i+31:i]) ENDFOR dst[MAX:512] := 0
prefetchnta, prefetcht0, prefetcht1, prefetcht2
void _mm_prefetch (char const* p, int i)

Synopsis

void _mm_prefetch (char const* p, int i)
#include "xmmintrin.h"
Instruction: prefetchnta mprefetch
             prefetcht0 mprefetch
             prefetcht1 mprefetch
             prefetcht2 mprefetch
CPUID Flags: SSE

Description

Fetch the line of data from memory that contains address p to a location in the cache hierarchy specified by the locality hint i.
prefetchwt1
void _mm_prefetch (char const* p, int i)

Synopsis

void _mm_prefetch (char const* p, int i)
#include "xmmintrin.h"
Instruction: prefetchwt1 mprefetch
CPUID Flags: PREFETCHWT1

Description

Fetch the line of data from memory that contains address p to a location in the cache hierarchy specified by the locality hint i.
vprefetch0, vprefetch1, vprefetch2, vprefetchnta, vprefetche0, vprefetche1, vprefetche2, vprefetchenta
void _mm_prefetch (char const* p, int i)

Synopsis

void _mm_prefetch (char const* p, int i)
#include "xmmintrin.h"
Instruction: vprefetch0 mprefetch
             vprefetch1 mprefetch
             vprefetch2 mprefetch
             vprefetchnta mprefetch
             vprefetche0 mprefetch
             vprefetche1 mprefetch
             vprefetche2 mprefetch
             vprefetchenta mprefetch
CPUID Flags: KNCNI

Description

Fetch the line of data from memory that contains address p to a location in the cache hierarchy specified by the locality hint i.
vgatherpf0dps, vgatherpf1dps
void _mm512_mask_prefetch_i32extgather_ps (__m512i index, __mmask16 k, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)

Synopsis

void _mm512_mask_prefetch_i32extgather_ps (__m512i index, __mmask16 k, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0dps m512 {k}
             vgatherpf1dps m512 {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC

Description

Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address mv and 32-bit integer index vector index with scale scale to L1 or L2 level of cache depending on the value of hint. Gathered elements are merged in cache using writemask k (elements are brought into cache only when their corresponding mask bits are set). The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. The conv parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the conv parameter specified for the subsequent gather intrinsic.

Operation

FOR j := 0 to 15 addr := MEM[mv + index[j] * scale] i := j*32 IF k[j] THEN CASE hint OF _MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i]) _MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i]) ESAC FI ENDFOR dst[MAX:512] := 0
vgatherpf0dps, vgatherpf1dps
void _mm512_prefetch_i32extgather_ps (__m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)

Synopsis

void _mm512_prefetch_i32extgather_ps (__m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0dps m512 {k}
             vgatherpf1dps m512 {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC

Description

Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address mv and 32-bit integer index vector index with scale scale to L1 or L2 level of cache depending on the value of hint. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. The conv parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the conv parameter specified for the subsequent gather intrinsic.

Operation

FOR j := 0 to 15 addr := MEM[mv + index[j] * scale] i := j*32 CASE hint OF _MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i]) _MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i]) ESAC ENDFOR dst[MAX:512] := 0
vscatterpf0dps, vscatterpf1dps
void _mm512_mask_prefetch_i32extscatter_ps (void * mv, __mmask16 k, __m512i index, _MM_UPCONV_PS_ENUM conv, int scale, int hint)

Synopsis

void _mm512_mask_prefetch_i32extscatter_ps (void * mv, __mmask16 k, __m512i index, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0dps m512 {k}
             vscatterpf1dps m512 {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC

Description

Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address mv and 32-bit integer index vector index with scale scale to L1 or L2 level of cache depending on the value of hint. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. The conv parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the conv parameter specified for the subsequent gather intrinsic. Only those elements whose corresponding mask bit in k is set are loaded into cache.

Operation

cachev := 0 FOR j := 0 to 15 i := j*32 IF k[j] addr := MEM[mv + index[j] * scale] CASE hint OF _MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i]) _MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i]) ESAC FI ENDFOR
vscatterpf0dps, vscatterpf1dps
void _mm512_prefetch_i32extscatter_ps (void * mv, __m512i index, _MM_UPCONV_PS_ENUM conv, int scale, int hint)

Synopsis

void _mm512_prefetch_i32extscatter_ps (void * mv, __m512i index, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0dps m512 {k}
             vscatterpf1dps m512 {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC

Description

Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address mv and 32-bit integer index vector index with scale scale to L1 or L2 level of cache depending on the value of hint, with a request for exclusive ownership. The hint parameter may be one of the following: _MM_HINT_T0 = 1 for prefetching to L1 cache, _MM_HINT_T1 = 2 for prefetching to L2 cache, _MM_HINT_T2 = 3 for prefetching to L2 cache non-temporal, _MM_HINT_NTA = 0 for prefetching to L1 cache non-temporal. The conv parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the conv parameter specified for the subsequent scatter intrinsic.

Operation

cachev := 0 FOR j := 0 to 15 i := j*32 addr := MEM[mv + index[j] * scale] CASE hint OF _MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i]) _MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i]) _MM_HINT_T2: PrefetchL2WithT1HintNonTemporal(addr[i+31:i]) _MM_HINT_NTA: PrefetchL1WithT0HintNonTemporal(addr[i+31:i]) ESAC ENDFOR
vgatherpf0dpd, vgatherpf1dpd
void _mm512_mask_prefetch_i32gather_pd (__m256i vindex, __mmask8 mask, void const* base_addr, int scale, int hint)

Synopsis

void _mm512_mask_prefetch_i32gather_pd (__m256i vindex, __mmask8 mask, void const* base_addr, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0dpd vm32y {k}
             vgatherpf1dpd vm32y {k}
CPUID Flags: AVX512PF

Description

Prefetch double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged in cache using writemask k (elements are brought into cache only when their corresponding mask bits are set). scale should be 1, 2, 4 or 8. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.

Operation

FOR j := 0 to 7 i := j*32; IF mask[j] THEN Prefetch([base_addr + SignExtend(vindex[i+31:i]) * scale], hint, RFO=0); FI ENDFOR;
vgatherpf0dpd, vgatherpf1dpd
void _mm512_prefetch_i32gather_pd (__m256i vindex, void const* base_addr, int scale, int hint)

Synopsis

void _mm512_prefetch_i32gather_pd (__m256i vindex, void const* base_addr, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0dpd vm32y {k}
             vgatherpf1dpd vm32y {k}
CPUID Flags: AVX512PF

Description

Prefetch double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged in cache. scale should be 1, 2, 4 or 8. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.

Operation

FOR j := 0 to 7 i := j*32; Prefetch([base_addr + SignExtend(vindex[i+31:i]) * scale], hint, RFO=0); ENDFOR;
vgatherpf0dps, vgatherpf1dps
void _mm512_mask_prefetch_i32gather_ps (__m512i vindex, __mmask16 mask, void const* base_addr, int scale, int hint)

Synopsis

void _mm512_mask_prefetch_i32gather_ps (__m512i vindex, __mmask16 mask, void const* base_addr, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0dps vm32y {k}
             vgatherpf1dps vm32y {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC

Description

Prefetch single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged in cache using writemask k (elements are brought into cache only when their corresponding mask bits are set). scale should be 1, 2, 4 or 8. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.

Operation

FOR j := 0 to 15 i := j*32; IF mask[j] THEN Prefetch([base_addr + SignExtend(vindex[i+31:i]) * scale], hint, RFO=0); FI ENDFOR;
vgatherpf0dps, vgatherpf1dps
void _mm512_prefetch_i32gather_ps (__m512i index, void const* mv, int scale, int hint)

Synopsis

void _mm512_prefetch_i32gather_ps (__m512i index, void const* mv, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0dps m512 {k}
             vgatherpf1dps m512 {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC

Description

Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location mv at packed 32-bit integer indices stored in index scaled by scale. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.

Operation

cachev := 0 FOR j := 0 to 15 i := j*32 addr := MEM[mv + index[j] * scale] cachev[i+31:i] := addr[i+31:i] ENDFOR
vscatterpf0dpd, vscatterpf1dpd
void _mm512_mask_prefetch_i32scatter_pd (void* base_addr, __mmask8 mask, __m256i vindex, int scale, int hint)

Synopsis

void _mm512_mask_prefetch_i32scatter_pd (void* base_addr, __mmask8 mask, __m256i vindex, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0dpd vm32y {k}
             vscatterpf1dpd vm32y {k}
CPUID Flags: AVX512PF

Description

Prefetch double-precision (64-bit) floating-point elements with intent to write using 32-bit indices. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not brought into cache when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 TO 7 i := j*32; IF mask[j] THEN Prefetch([base_addr + SignExtend(vindex[i+31:i]) * scale], Level=hint, RFO=1); FI ENDFOR;
vscatterpf0dpd, vscatterpf1dpd
void _mm512_prefetch_i32scatter_pd (void* base_addr, __m256i vindex, int scale, int hint)

Synopsis

void _mm512_prefetch_i32scatter_pd (void* base_addr, __m256i vindex, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0dpd vm32y {k}
             vscatterpf1dpd vm32y {k}
CPUID Flags: AVX512PF

Description

Prefetch double-precision (64-bit) floating-point elements with intent to write using 32-bit indices. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 TO 7 i := j*32; Prefetch([base_addr + SignExtend(vindex[i+31:i]) * scale], Level=hint, RFO=1); ENDFOR;
vscatterpf0dps, vscatterpf1dps
void _mm512_mask_prefetch_i32scatter_ps (void* mv, __mmask16 k, __m512i index, int scale, int hint)

Synopsis

void _mm512_mask_prefetch_i32scatter_ps (void* mv, __mmask16 k, __m512i index, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0dps m512 {k}
             vscatterpf1dps m512 {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC

Description

Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location mv at packed 32-bit integer indices stored in index scaled by scale. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. Only those elements whose corresponding mask bit in k is set are loaded into the desired cache.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] addr := MEM[mv + index[j] * scale] CASE hint OF _MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i]) _MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i]) _MM_HINT_T2: PrefetchL2WithT1HintNonTemporal(addr[i+31:i]) _MM_HINT_NTA: PrefetchL1WithT0HintNonTemporal(addr[i+31:i]) ESAC FI ENDFOR
vscatterpf0dps, vscatterpf1dps
void _mm512_prefetch_i32scatter_ps (void* mv, __m512i index, int scale, int hint)

Synopsis

void _mm512_prefetch_i32scatter_ps (void* mv, __m512i index, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0dps m512 {k}
             vscatterpf1dps m512 {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC

Description

Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location mv at packed 32-bit integer indices stored in index scaled by scale. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.

Operation

FOR j := 0 to 15 i := j*32 addr := MEM[mv + index[j] * scale] CASE hint OF _MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i]) _MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i]) _MM_HINT_T2: PrefetchL2WithT1HintNonTemporal(addr[i+31:i]) _MM_HINT_NTA: PrefetchL1WithT0HintNonTemporal(addr[i+31:i]) ESAC ENDFOR
vgatherpf0qpd, vgatherpf1qpd
void _mm512_mask_prefetch_i64gather_pd (__m512i vindex, __mmask8 mask, void const* base_addr, int scale, int hint)

Synopsis

void _mm512_mask_prefetch_i64gather_pd (__m512i vindex, __mmask8 mask, void const* base_addr, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0qpd vm32z {k}
             vgatherpf1qpd vm32z {k}
CPUID Flags: AVX512PF

Description

Prefetch double-precision (64-bit) floating-point elements from memory into cache level specified by hint using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Prefetched elements are merged in cache using writemask k (elements are copied from memory when the corresponding mask bit is set). scale should be 1, 2, 4 or 8. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.

Operation

FOR j := 0 to 7 i := j*64; IF mask[j] THEN Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=0); FI ENDFOR;
vgatherpf0qpd, vgatherpf1qpd
void _mm512_prefetch_i64gather_pd (__m512i vindex, void const* base_addr, int scale, int hint)

Synopsis

void _mm512_prefetch_i64gather_pd (__m512i vindex, void const* base_addr, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0qpd vm32z {k}
             vgatherpf1qpd vm32z {k}
CPUID Flags: AVX512PF

Description

Prefetch double-precision (64-bit) floating-point elements from memory into cache level specified by hint using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.

Operation

FOR j := 0 to 7 i := j*64; Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=0); ENDFOR;
vgatherpf0qps, vgatherpf1qps
void _mm512_mask_prefetch_i64gather_ps (__m512i vindex, __mmask8 mask, void const* base_addr, int scale, int hint)

Synopsis

void _mm512_mask_prefetch_i64gather_ps (__m512i vindex, __mmask8 mask, void const* base_addr, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0qps vm64z {k}
             vgatherpf1qps vm64z {k}
CPUID Flags: AVX512PF

Description

Prefetch single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged in cache using writemask k (elements are only brought into cache when their corresponding mask bit is set). scale should be 1, 2, 4 or 8. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.

Operation

FOR j:= 0 to 7 i := j*64; IF mask[j] THEN Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], hint, RFO=0); FI ENDFOR;
vgatherpf0qps, vgatherpf1qps
void _mm512_prefetch_i64gather_ps (__m512i vindex, void const* base_addr, int scale, int hint)

Synopsis

void _mm512_prefetch_i64gather_ps (__m512i vindex, void const* base_addr, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0qps vm64z {k}
             vgatherpf1qps vm64z {k}
CPUID Flags: AVX512PF

Description

Prefetch single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged in cache. scale should be 1, 2, 4 or 8. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.

Operation

FOR j:= 0 to 7 i := j*64; Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], hint, RFO=0); ENDFOR;
vscatterpf0qpd, vscatterpf1qpd
void _mm512_mask_prefetch_i64scatter_pd (void* base_addr, __mmask8 mask, __m512i vindex, int scale, int hint)

Synopsis

void _mm512_mask_prefetch_i64scatter_pd (void* base_addr, __mmask8 mask, __m512i vindex, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0qpd vm32z {k}
             vscatterpf1qpd vm32z {k}
CPUID Flags: AVX512PF

Description

Prefetch double-precision (64-bit) floating-point elements with intent to write into memory using 64-bit indices. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not brought into cache when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64; IF mask[j] THEN Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=1); FI ENDFOR;
vscatterpf0qpd, vscatterpf1qpd
void _mm512_prefetch_i64scatter_pd (void* base_addr, __m512i vindex, int scale, int hint)

Synopsis

void _mm512_prefetch_i64scatter_pd (void* base_addr, __m512i vindex, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0qpd vm32z {k}
             vscatterpf1qpd vm32z {k}
CPUID Flags: AVX512PF

Description

Prefetch double-precision (64-bit) floating-point elements with intent to write into memory using 64-bit indices. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64; Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=1); ENDFOR;
vscatterpf0qps, vscatterpf1qps
void _mm512_mask_prefetch_i64scatter_ps (void* base_addr, __mmask8 mask, __m512i vindex, int scale, int hint)

Synopsis

void _mm512_mask_prefetch_i64scatter_ps (void* base_addr, __mmask8 mask, __m512i vindex, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0qps vm64z {k}
             vscatterpf1qps vm64z {k}
CPUID Flags: AVX512PF

Description

Prefetch single-precision (32-bit) floating-point elements with intent to write into memory using 64-bit indices. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not brought into cache when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64; IF mask[j] THEN Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=1); FI ENDFOR;
vscatterpf0qps, vscatterpf1qps
void _mm512_prefetch_i64scatter_ps (void* base_addr, __m512i vindex, int scale, int hint)

Synopsis

void _mm512_prefetch_i64scatter_ps (void* base_addr, __m512i vindex, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0qps vm64z {k}
             vscatterpf1qps vm64z {k}
CPUID Flags: AVX512PF

Description

Prefetch single-precision (32-bit) floating-point elements with intent to write into memory using 64-bit indices. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.

Operation

FOR j := 0 to 7 i := j*64; Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=1); ENDFOR;
psadbw
__m64 _m_psadbw (__m64 a, __m64 b)

Synopsis

__m64 _m_psadbw (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: psadbw mm, mm
CPUID Flags: SSE

Description

Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum the 8 differences to produce one unsigned 16-bit integer, and pack this unsigned 16-bit integer in the low 16 bits of dst.

Operation

FOR j := 0 to 7 i := j*8 tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) ENDFOR dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0
pshufw
__m64 _m_pshufw (__m64 a, int imm8)

Synopsis

__m64 _m_pshufw (__m64 a, int imm8)
#include "xmmintrin.h"
Instruction: pshufw mm, mm, imm
CPUID Flags: SSE

Description

Shuffle 16-bit integers in a using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[15:0] := src[15:0] 1: tmp[15:0] := src[31:16] 2: tmp[15:0] := src[47:32] 3: tmp[15:0] := src[63:48] ESAC RETURN tmp[15:0] } dst[15:0] := SELECT4(a[63:0], imm8[1:0]) dst[31:16] := SELECT4(a[63:0], imm8[3:2]) dst[47:32] := SELECT4(a[63:0], imm8[5:4]) dst[63:48] := SELECT4(a[63:0], imm8[7:6])
vrangepd
__m128d _mm_mask_range_pd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8)

Synopsis

__m128d _mm_mask_range_pd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512VL + AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vrangepd
__m128d _mm_maskz_range_pd (__mmask8 k, __m128d a, __m128d b, int imm8)

Synopsis

__m128d _mm_maskz_range_pd (__mmask8 k, __m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512VL + AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vrangepd
__m128d _mm_range_pd (__m128d a, __m128d b, int imm8)

Synopsis

__m128d _mm_range_pd (__m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512VL + AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } FOR j := 0 to 1 i := j*64 dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ENDFOR dst[MAX:128] := 0
vrangepd
__m256d _mm256_mask_range_pd (__m256d src, __mmask8 k, __m256d a, __m256d b, int imm8)

Synopsis

__m256d _mm256_mask_range_pd (__m256d src, __mmask8 k, __m256d a, __m256d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512VL + AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vrangepd
__m256d _mm256_maskz_range_pd (__mmask8 k, __m256d a, __m256d b, int imm8)

Synopsis

__m256d _mm256_maskz_range_pd (__mmask8 k, __m256d a, __m256d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512VL + AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vrangepd
__m256d _mm256_range_pd (__m256d a, __m256d b, int imm8)

Synopsis

__m256d _mm256_range_pd (__m256d a, __m256d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512VL + AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } FOR j := 0 to 3 i := j*64 dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ENDFOR dst[MAX:256] := 0
vrangepd
__m512d _mm512_mask_range_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int imm8)

Synopsis

__m512d _mm512_mask_range_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vrangepd
__m512d _mm512_maskz_range_pd (__mmask8 k, __m512d a, __m512d b, int imm8)

Synopsis

__m512d _mm512_maskz_range_pd (__mmask8 k, __m512d a, __m512d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vrangepd
__m512d _mm512_range_pd (__m512d a, __m512d b, int imm8)

Synopsis

__m512d _mm512_range_pd (__m512d a, __m512d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } FOR j := 0 to 7 i := j*64 dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ENDFOR dst[MAX:512] := 0
vrangeps
__m128 _mm_mask_range_ps (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8)

Synopsis

__m128 _mm_mask_range_ps (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512VL + AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vrangeps
__m128 _mm_maskz_range_ps (__mmask8 k, __m128 a, __m128 b, int imm8)

Synopsis

__m128 _mm_maskz_range_ps (__mmask8 k, __m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512VL + AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vrangeps
__m128 _mm_range_ps (__m128 a, __m128 b, int imm8)

Synopsis

__m128 _mm_range_ps (__m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512VL + AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } FOR j := 0 to 3 i := j*32 dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ENDFOR dst[MAX:128] := 0
vrangeps
__m256 _mm256_mask_range_ps (__m256 src, __mmask8 k, __m256 a, __m256 b, int imm8)

Synopsis

__m256 _mm256_mask_range_ps (__m256 src, __mmask8 k, __m256 a, __m256 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512VL + AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vrangeps
__m256 _mm256_maskz_range_ps (__mmask8 k, __m256 a, __m256 b, int imm8)

Synopsis

__m256 _mm256_maskz_range_ps (__mmask8 k, __m256 a, __m256 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512VL + AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vrangeps
__m256 _mm256_range_ps (__m256 a, __m256 b, int imm8)

Synopsis

__m256 _mm256_range_ps (__m256 a, __m256 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512VL + AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } FOR j := 0 to 7 i := j*32 dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ENDFOR dst[MAX:256] := 0
vrangeps
__m512 _mm512_mask_range_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int imm8)

Synopsis

__m512 _mm512_mask_range_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vrangeps
__m512 _mm512_maskz_range_ps (__mmask16 k, __m512 a, __m512 b, int imm8)

Synopsis

__m512 _mm512_maskz_range_ps (__mmask16 k, __m512 a, __m512 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vrangeps
__m512 _mm512_range_ps (__m512 a, __m512 b, int imm8)

Synopsis

__m512 _mm512_range_ps (__m512 a, __m512 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } FOR j := 0 to 15 i := j*32 dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ENDFOR dst[MAX:512] := 0
vrangepd
__m512d _mm512_mask_range_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int imm8, int rounding)

Synopsis

__m512d _mm512_mask_range_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vrangepd
__m512d _mm512_maskz_range_round_pd (__mmask8 k, __m512d a, __m512d b, int imm8, int rounding)

Synopsis

__m512d _mm512_maskz_range_round_pd (__mmask8 k, __m512d a, __m512d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vrangepd
__m512d _mm512_range_round_pd (__m512d a, __m512d b, int imm8, int rounding)

Synopsis

__m512d _mm512_range_round_pd (__m512d a, __m512d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } FOR j := 0 to 7 i := j*64 dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ENDFOR dst[MAX:512] := 0
vrangeps
__m512 _mm512_mask_range_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int imm8, int rounding)

Synopsis

__m512 _mm512_mask_range_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vrangeps
__m512 _mm512_maskz_range_round_ps (__mmask16 k, __m512 a, __m512 b, int imm8, int rounding)

Synopsis

__m512 _mm512_maskz_range_round_ps (__mmask16 k, __m512 a, __m512 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vrangeps
__m512 _mm512_range_round_ps (__m512 a, __m512 b, int imm8, int rounding)

Synopsis

__m512 _mm512_range_round_ps (__m512 a, __m512 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } FOR j := 0 to 15 i := j*32 dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ENDFOR dst[MAX:512] := 0
vrangesd
__m128d _mm_mask_range_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8, int rounding)

Synopsis

__m128d _mm_mask_range_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangesd
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } IF k[0] dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vrangesd
__m128d _mm_maskz_range_round_sd (__mmask8 k, __m128d a, __m128d b, int imm8, int rounding)

Synopsis

__m128d _mm_maskz_range_round_sd (__mmask8 k, __m128d a, __m128d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangesd
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } IF k[0] dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vrangesd
__m128d _mm_range_round_sd (__m128d a, __m128d b, int imm8, int rounding)

Synopsis

__m128d _mm_range_round_sd (__m128d a, __m128d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangesd
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) dst[127:64] := a[127:64] dst[MAX:128] := 0
vrangess
__m128 _mm_mask_range_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8, int rounding)

Synopsis

__m128 _mm_mask_range_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangess
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } IF k[0] dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vrangess
__m128 _mm_maskz_range_round_ss (__mmask8 k, __m128 a, __m128 b, int imm8, int rounding)

Synopsis

__m128 _mm_maskz_range_round_ss (__mmask8 k, __m128 a, __m128 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangess
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } IF k[0] dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vrangess
__m128 _mm_range_round_ss (__m128 a, __m128 b, int imm8, int rounding)

Synopsis

__m128 _mm_range_round_ss (__m128 a, __m128 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangess
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) dst[127:32] := a[127:32] dst[MAX:128] := 0
vrangesd
__m128d _mm_mask_range_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8)

Synopsis

__m128d _mm_mask_range_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vrangesd
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } IF k[0] dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vrangesd
__m128d _mm_maskz_range_sd (__mmask8 k, __m128d a, __m128d b, int imm8)

Synopsis

__m128d _mm_maskz_range_sd (__mmask8 k, __m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vrangesd
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] ESAC CASE signSelCtl[1:0] 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) 1: dst[63:0] := tmp[63:0] 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) ESAC RETURN dst } IF k[0] dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vrangess
__m128 _mm_mask_range_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8)

Synopsis

__m128 _mm_mask_range_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vrangess
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } IF k[0] dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vrangess
__m128 _mm_maskz_range_ss (__mmask8 k, __m128 a, __m128 b, int imm8)

Synopsis

__m128 _mm_maskz_range_ss (__mmask8 k, __m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vrangess
CPUID Flags: AVX512DQ

Description

Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.

Operation

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] ESAC CASE signSelCtl[1:0] 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } IF k[0] dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
rcpps
__m128 _mm_rcp_ps (__m128 a)

Synopsis

__m128 _mm_rcp_ps (__m128 a)
#include "xmmintrin.h"
Instruction: rcpps xmm, xmm
CPUID Flags: SSE

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 1.5*2^-12.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge51
Sandy Bridge51
Westmere32
Nehalem32
vrcpps
__m256 _mm256_rcp_ps (__m256 a)

Synopsis

__m256 _mm256_rcp_ps (__m256 a)
#include "immintrin.h"
Instruction: vrcpps ymm, ymm
CPUID Flags: AVX

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 1.5*2^-12.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell71
Ivy Bridge71
Sandy Bridge71
rcpss
__m128 _mm_rcp_ss (__m128 a)

Synopsis

__m128 _mm_rcp_ss (__m128 a)
#include "xmmintrin.h"
Instruction: rcpss xmm, xmm
CPUID Flags: SSE

Description

Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 1.5*2^-12.

Operation

dst[31:0] := APPROXIMATE(1.0/a[31:0]) dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge51
Sandy Bridge51
Westmere33
Nehalem33
vrcp14pd
__m128d _mm_mask_rcp14_pd (__m128d src, __mmask8 k, __m128d a)

Synopsis

__m128d _mm_mask_rcp14_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vrcp14pd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vrcp14pd
__m128d _mm_maskz_rcp14_pd (__mmask8 k, __m128d a)

Synopsis

__m128d _mm_maskz_rcp14_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vrcp14pd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vrcp14pd
__m128d _mm_rcp14_pd (__m128d a)

Synopsis

__m128d _mm_rcp14_pd (__m128d a)
#include "immintrin.h"
Instruction: vrcp14pd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ENDFOR dst[MAX:128] := 0
vrcp14pd
__m256d _mm256_mask_rcp14_pd (__m256d src, __mmask8 k, __m256d a)

Synopsis

__m256d _mm256_mask_rcp14_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vrcp14pd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vrcp14pd
__m256d _mm256_maskz_rcp14_pd (__mmask8 k, __m256d a)

Synopsis

__m256d _mm256_maskz_rcp14_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vrcp14pd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vrcp14pd
__m256d _mm256_rcp14_pd (__m256d a)

Synopsis

__m256d _mm256_rcp14_pd (__m256d a)
#include "immintrin.h"
Instruction: vrcp14pd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ENDFOR dst[MAX:256] := 0
vrcp14pd
__m512d _mm512_mask_rcp14_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_rcp14_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrcp14pd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vrcp14pd
__m512d _mm512_maskz_rcp14_pd (__mmask8 k, __m512d a)

Synopsis

__m512d _mm512_maskz_rcp14_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrcp14pd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vrcp14pd
__m512d _mm512_rcp14_pd (__m512d a)

Synopsis

__m512d _mm512_rcp14_pd (__m512d a)
#include "immintrin.h"
Instruction: vrcp14pd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ENDFOR dst[MAX:512] := 0
vrcp14ps
__m128 _mm_mask_rcp14_ps (__m128 src, __mmask8 k, __m128 a)

Synopsis

__m128 _mm_mask_rcp14_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vrcp14ps
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vrcp14ps
__m128 _mm_maskz_rcp14_ps (__mmask8 k, __m128 a)

Synopsis

__m128 _mm_maskz_rcp14_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vrcp14ps
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vrcp14ps
__m128 _mm_rcp14_ps (__m128 a)

Synopsis

__m128 _mm_rcp14_ps (__m128 a)
#include "immintrin.h"
Instruction: vrcp14ps
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ENDFOR dst[MAX:128] := 0
vrcp14ps
__m256 _mm256_mask_rcp14_ps (__m256 src, __mmask8 k, __m256 a)

Synopsis

__m256 _mm256_mask_rcp14_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vrcp14ps
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vrcp14ps
__m256 _mm256_maskz_rcp14_ps (__mmask8 k, __m256 a)

Synopsis

__m256 _mm256_maskz_rcp14_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vrcp14ps
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vrcp14ps
__m256 _mm256_rcp14_ps (__m256 a)

Synopsis

__m256 _mm256_rcp14_ps (__m256 a)
#include "immintrin.h"
Instruction: vrcp14ps
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ENDFOR dst[MAX:256] := 0
vrcp14ps
__m512 _mm512_mask_rcp14_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_rcp14_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrcp14ps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vrcp14ps
__m512 _mm512_maskz_rcp14_ps (__mmask16 k, __m512 a)

Synopsis

__m512 _mm512_maskz_rcp14_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrcp14ps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vrcp14ps
__m512 _mm512_rcp14_ps (__m512 a)

Synopsis

__m512 _mm512_rcp14_ps (__m512 a)
#include "immintrin.h"
Instruction: vrcp14ps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ENDFOR dst[MAX:512] := 0
vrcp14sd
__m128d _mm_mask_rcp14_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_rcp14_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrcp14sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.

Operation

IF k[0] dst[63:0] := APPROXIMATE(1.0/b[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vrcp14sd
__m128d _mm_maskz_rcp14_sd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_rcp14_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrcp14sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.

Operation

IF k[0] dst[63:0] := APPROXIMATE(1.0/b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vrcp14sd
__m128d _mm_rcp14_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_rcp14_sd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrcp14sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.

Operation

dst[63:0] := APPROXIMATE(1.0/b[63:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0
vrcp14ss
__m128 _mm_mask_rcp14_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_rcp14_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrcp14ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.

Operation

IF k[0] dst[31:0] := APPROXIMATE(1.0/b[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vrcp14ss
__m128 _mm_maskz_rcp14_ss (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_rcp14_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrcp14ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.

Operation

IF k[0] dst[31:0] := APPROXIMATE(1.0/b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vrcp14ss
__m128 _mm_rcp14_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_rcp14_ss (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrcp14ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.

Operation

dst[31:0] := APPROXIMATE(1.0/b[31:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
vrcp23ps
__m512 _mm512_mask_rcp23_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_rcp23_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrcp23ps zmm {k}, m512
CPUID Flags: KNCNI

Description

Approximates the reciprocals of packed single-precision (32-bit) floating-point elements in a to 23 bits of precision, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vrcp23ps
__m512 _mm512_rcp23_ps (__m512 a)

Synopsis

__m512 _mm512_rcp23_ps (__m512 a)
#include "immintrin.h"
Instruction: vrcp23ps zmm {k}, m512
CPUID Flags: KNCNI

Description

Approximates the reciprocals of packed single-precision (32-bit) floating-point elements in a to 23 bits of precision, storing the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ENDFOR dst[MAX:512] := 0
vrcp28pd
__m512d _mm512_mask_rcp28_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_rcp28_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrcp28pd zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.

Operation

FOR j := 0 to 7 i := j*64; IF k[j] THEN dst[i+63:i] := RCP_28_SP(1.0/a[i+63:i]); ELSE dst[i+63:i] := src[i+63:i]; FI ENDFOR;
vrcp28pd
__m512d _mm512_maskz_rcp28_pd (__mmask8 k, __m512d a)

Synopsis

__m512d _mm512_maskz_rcp28_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrcp28pd zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.

Operation

FOR j := 0 to 7 i := j*64; IF k[j] THEN dst[i+63:i] := RCP_28_SP(1.0/a[i+63:i]); ELSE dst[i+63:i] := 0; FI ENDFOR;
vrcp28pd
__m512d _mm512_rcp28_pd (__m512d a)

Synopsis

__m512d _mm512_rcp28_pd (__m512d a)
#include "immintrin.h"
Instruction: vrcp28pd zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-28.

Operation

FOR j := 0 to 7 i := j*64; dst[i+63:i] := RCP_28_SP(1.0/a[i+63:i]); ENDFOR;
vrcp28ps
__m512 _mm512_mask_rcp28_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_rcp28_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrcp28ps zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.

Operation

FOR j := 0 to 15 i := j*32; IF k[j] THEN dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]); ELSE dst[i+31:i] := src[i+31:i]; FI ENDFOR;
vrcp28ps
__m512 _mm512_maskz_rcp28_ps (__mmask16 k, __m512 a)

Synopsis

__m512 _mm512_maskz_rcp28_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrcp28ps zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.

Operation

FOR j := 0 to 15 i := j*32; IF k[j] THEN dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]); ELSE dst[i+31:i] := 0; FI ENDFOR;
vrcp28ps
__m512 _mm512_rcp28_ps (__m512 a)

Synopsis

__m512 _mm512_rcp28_ps (__m512 a)
#include "immintrin.h"
Instruction: vrcp28ps zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-28.

Operation

FOR j := 0 to 15 i := j*32; dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]); ENDFOR;
vrcp28pd
__m512d _mm512_mask_rcp28_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)

Synopsis

__m512d _mm512_mask_rcp28_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vrcp28pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64; IF k[j] THEN dst[i+63:i] := RCP_28_SP(1.0/a[i+63:i]); ELSE dst[i+63:i] := src[i+63:i]; FI ENDFOR;
vrcp28pd
__m512d _mm512_maskz_rcp28_round_pd (__mmask8 k, __m512d a, int rounding)

Synopsis

__m512d _mm512_maskz_rcp28_round_pd (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vrcp28pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64; IF k[j] THEN dst[i+63:i] := RCP_28_SP(1.0/a[i+63:i]); ELSE dst[i+63:i] := 0; FI ENDFOR;
vrcp28pd
__m512d _mm512_rcp28_round_pd (__m512d a, int rounding)

Synopsis

__m512d _mm512_rcp28_round_pd (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vrcp28pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64; dst[i+63:i] := RCP_28_SP(1.0/a[i+63:i]); ENDFOR;
vrcp28ps
__m512 _mm512_mask_rcp28_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)

Synopsis

__m512 _mm512_mask_rcp28_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vrcp28ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32; IF k[j] THEN dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]); ELSE dst[i+31:i] := src[i+31:i]; FI ENDFOR;
vrcp28ps
__m512 _mm512_maskz_rcp28_round_ps (__mmask16 k, __m512 a, int rounding)

Synopsis

__m512 _mm512_maskz_rcp28_round_ps (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vrcp28ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32; IF k[j] THEN dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]); ELSE dst[i+31:i] := 0; FI ENDFOR;
vrcp28ps
__m512 _mm512_rcp28_round_ps (__m512 a, int rounding)

Synopsis

__m512 _mm512_rcp28_round_ps (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vrcp28ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32; dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]); ENDFOR;
vrcp28sd
__m128d _mm_mask_rcp28_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_mask_rcp28_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vrcp28sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] THEN dst[63:0] := RCP_28_DP(1.0/b[63:0]); ELSE dst[63:0] := src[63:0]; FI dst[127:64] := a[127:64]; dst[MAX:128] := 0;
vrcp28sd
__m128d _mm_maskz_rcp28_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_maskz_rcp28_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vrcp28sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] THEN dst[63:0] := RCP_28_DP(1.0/b[63:0]); ELSE dst[63:0] := 0; FI dst[127:64] := a[127:64]; dst[MAX:128] := 0;
vrcp28sd
__m128d _mm_rcp28_round_sd (__m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_rcp28_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vrcp28sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := RCP_28_DP(1.0/b[63:0]); dst[127:64] := a[127:64]; dst[MAX:128] := 0;
vrcp28ss
__m128 _mm_mask_rcp28_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_mask_rcp28_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vrcp28ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] THEN dst[31:0] := RCP_28_DP(1.0/b[31:0]); ELSE dst[31:0] := src[31:0]; FI dst[127:32] := a[127:32]; dst[MAX:128] := 0;
vrcp28ss
__m128 _mm_maskz_rcp28_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_maskz_rcp28_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vrcp28ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] THEN dst[31:0] := RCP_28_DP(1.0/b[31:0]); ELSE dst[31:0] := 0; FI dst[127:32] := a[127:32]; dst[MAX:128] := 0;
vrcp28ss
__m128 _mm_rcp28_round_ss (__m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_rcp28_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vrcp28ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := RCP_28_DP(1.0/b[31:0]); dst[127:32] := a[127:32]; dst[MAX:128] := 0;
vrcp28sd
__m128d _mm_mask_rcp28_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_rcp28_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrcp28sd xmm {k}, xmm, xmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.

Operation

IF k[0] THEN dst[63:0] := RCP_28_DP(1.0/b[63:0]); ELSE dst[63:0] := src[63:0]; FI dst[127:64] := a[127:64]; dst[MAX:128] := 0;
vrcp28sd
__m128d _mm_maskz_rcp28_sd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_rcp28_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrcp28sd xmm {k}, xmm, xmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.

Operation

IF k[0] THEN dst[63:0] := RCP_28_DP(1.0/b[63:0]); ELSE dst[63:0] := 0; FI dst[127:64] := a[127:64]; dst[MAX:128] := 0;
vrcp28sd
__m128d _mm_rcp28_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_rcp28_sd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrcp28sd xmm {k}, xmm, xmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.

Operation

dst[63:0] := RCP_28_DP(1.0/b[63:0]); dst[127:64] := a[127:64]; dst[MAX:128] := 0;
vrcp28ss
__m128 _mm_mask_rcp28_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_rcp28_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrcp28ss xmm {k}, xmm, xmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.

Operation

IF k[0] THEN dst[31:0] := RCP_28_DP(1.0/b[31:0]); ELSE dst[31:0] := src[31:0]; FI dst[127:32] := a[127:32]; dst[MAX:128] := 0;
vrcp28ss
__m128 _mm_maskz_rcp28_ss (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_rcp28_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrcp28ss xmm {k}, xmm, xmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.

Operation

IF k[0] THEN dst[31:0] := RCP_28_DP(1.0/b[31:0]); ELSE dst[31:0] := 0; FI dst[127:32] := a[127:32]; dst[MAX:128] := 0;
vrcp28ss
__m128 _mm_rcp28_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_rcp28_ss (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrcp28ss xmm {k}, xmm, xmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.

Operation

dst[31:0] := RCP_28_DP(1.0/b[31:0]); dst[127:32] := a[127:32]; dst[MAX:128] := 0;
rdpmc
__int64 _rdpmc (int a)

Synopsis

__int64 _rdpmc (int a)
#include "immintrin.h"
Instruction: rdpmc

Description

Read the Performance Monitor Counter (PMC) specified by a, and store up to 64-bits in dst. The width of performance counters is implementation specific.

Operation

dst[63:0] := ReadPMC(a)
rdrand
int _rdrand16_step (unsigned short* val)

Synopsis

int _rdrand16_step (unsigned short* val)
#include "immintrin.h"
Instruction: rdrand r16
CPUID Flags: RDRAND

Description

Read a hardware generated 16-bit random value and store the result in val. Return 1 if a random value was generated, and 0 otherwise.

Operation

IF HW_RND_GEN.ready = 1 val[15:0] := HW_RND_GEN.data; RETURN 1; ELSE val[15:0] := 0; RETURN 0; FI
rdrand
int _rdrand32_step (unsigned int* val)

Synopsis

int _rdrand32_step (unsigned int* val)
#include "immintrin.h"
Instruction: rdrand r32
CPUID Flags: RDRAND

Description

Read a hardware generated 32-bit random value and store the result in val. Return 1 if a random value was generated, and 0 otherwise.

Operation

IF HW_RND_GEN.ready = 1 val[31:0] := HW_RND_GEN.data; RETURN 1; ELSE val[31:0] := 0; RETURN 0; FI
rdrand
int _rdrand64_step (unsigned __int64* val)

Synopsis

int _rdrand64_step (unsigned __int64* val)
#include "immintrin.h"
Instruction: rdrand r64
CPUID Flags: RDRAND

Description

Read a hardware generated 64-bit random value and store the result in val. Return 1 if a random value was generated, and 0 otherwise.

Operation

IF HW_RND_GEN.ready = 1 val[63:0] := HW_RND_GEN.data; RETURN 1; ELSE val[63:0] := 0; RETURN 0; FI

Performance

ArchitectureLatencyThroughput
HaswellVariesVaries
Ivy Bridge200200
rdseed
int _rdseed16_step (unsigned short * val)

Synopsis

int _rdseed16_step (unsigned short * val)
#include "immintrin.h"
Instruction: rdseed r16
CPUID Flags: RDSEED

Description

Read a 16-bit NIST SP800-90B and SP800-90C compliant random value and store in val. Return 1 if a random value was generated, and 0 otherwise.

Operation

IF HW_NRND_GEN.ready = 1 THEN val[15:0] := HW_NRND_GEN.data RETURN 1 ELSE val[15:0] := 0 RETURN 0 FI
rdseed
int _rdseed32_step (unsigned int * val)

Synopsis

int _rdseed32_step (unsigned int * val)
#include "immintrin.h"
Instruction: rdseed r32
CPUID Flags: RDSEED

Description

Read a 32-bit NIST SP800-90B and SP800-90C compliant random value and store in val. Return 1 if a random value was generated, and 0 otherwise.

Operation

IF HW_NRND_GEN.ready = 1 THEN val[31:0] := HW_NRND_GEN.data RETURN 1 ELSE val[31:0] := 0 RETURN 0 FI
rdseed
int _rdseed64_step (unsigned __int64 * val)

Synopsis

int _rdseed64_step (unsigned __int64 * val)
#include "immintrin.h"
Instruction: rdseed r64
CPUID Flags: RDSEED

Description

Read a 64-bit NIST SP800-90B and SP800-90C compliant random value and store in val. Return 1 if a random value was generated, and 0 otherwise.

Operation

IF HW_NRND_GEN.ready = 1 THEN val[63:0] := HW_NRND_GEN.data RETURN 1 ELSE val[63:0] := 0 RETURN 0 FI
rdtsc
__int64 _rdtsc (void)

Synopsis

__int64 _rdtsc (void)
#include "immintrin.h"
Instruction: rdtsc
CPUID Flags: TSC

Description

Copy the current 64-bit value of the processor's time-stamp counter into dst.

Operation

dst[63:0] := TimeStampCounter
rdtscp
unsigned __int64 __rdtscp (unsigned int * mem_addr)

Synopsis

unsigned __int64 __rdtscp (unsigned int * mem_addr)
#include "immintrin.h"
Instruction: rdtscp
CPUID Flags: RDTSCP

Description

Copy the current 64-bit value of the processor's time-stamp counter into dst, and store the IA32_TSC_AUX MSR (signature value) into memory at mem_addr.

Operation

dst[63:0] := TimeStampCounter MEM[mem_addr+31:mem_addr] := IA32_TSC_AUX[31:0]

Performance

ArchitectureLatencyThroughput
Westmere9-
Nehalem9-
rdfsbase
unsigned int _readfsbase_u32 ()

Synopsis

unsigned int _readfsbase_u32 ()
#include "immintrin.h"
Instruction: rdfsbase r32
CPUID Flags: FSGSBASE

Description

Read the FS segment base register and store the 32-bit result in dst.

Operation

dst[31:0] := FS_Segment_Base_Register; dst[63:32] := 0
rdfsbase
unsigned __int64 _readfsbase_u64 ()

Synopsis

unsigned __int64 _readfsbase_u64 ()
#include "immintrin.h"
Instruction: rdfsbase r64
CPUID Flags: FSGSBASE

Description

Read the FS segment base register and store the 64-bit result in dst.

Operation

dst[63:0] := FS_Segment_Base_Register;
rdgsbase
unsigned int _readgsbase_u32 ()

Synopsis

unsigned int _readgsbase_u32 ()
#include "immintrin.h"
Instruction: rdgsbase r32
CPUID Flags: FSGSBASE

Description

Read the GS segment base register and store the 32-bit result in dst.

Operation

dst[31:0] := GS_Segment_Base_Register; dst[63:32] := 0
rdgsbase
unsigned __int64 _readgsbase_u64 ()

Synopsis

unsigned __int64 _readgsbase_u64 ()
#include "immintrin.h"
Instruction: rdgsbase r64
CPUID Flags: FSGSBASE

Description

Read the GS segment base register and store the 64-bit result in dst.

Operation

dst[63:0] := GS_Segment_Base_Register;
...
__m512d _mm512_mask_recip_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_recip_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Computes the reciprocal of packed double-precision (64-bit) floating-point elements in a, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (1 / a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_recip_pd (__m512d a)

Synopsis

__m512d _mm512_recip_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Computes the reciprocal of packed double-precision (64-bit) floating-point elements in a, storing the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := (1 / a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_recip_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_recip_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Computes the reciprocal of packed single-precision (32-bit) floating-point elements in a, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (1 / a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_recip_ps (__m512 a)

Synopsis

__m512 _mm512_recip_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Computes the reciprocal of packed single-precision (32-bit) floating-point elements in a, storing the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := (1 / a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
int _mm512_mask_reduce_add_epi32 (__mmask16 k, __m512i a)

Synopsis

int _mm512_mask_reduce_add_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 32-bit integers in a by addition using mask k. Returns the sum of all active elements in a.

Operation

sum[31:0] := 0 FOR j := 0 to 15 i := j*32 IF k[j] sum[31:0] := sum[31:0] + a[i+31:i] FI ENDFOR RETURN sum[31:0]
...
int _mm512_reduce_add_epi32 (__m512i a)

Synopsis

int _mm512_reduce_add_epi32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 32-bit integers in a by addition. Returns the sum of all elements in a.

Operation

sum[31:0] := 0 FOR j := 0 to 15 i := j*32 sum[31:0] := sum[31:0] + a[i+31:i] ENDFOR RETURN sum[31:0]
...
__int64 _mm512_mask_reduce_add_epi64 (__mmask8 k, __m512i a)

Synopsis

__int64 _mm512_mask_reduce_add_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 64-bit integers in a by addition using mask k. Returns the sum of all active elements in a.

Operation

sum[63:0] := 0 FOR j := 0 to 7 i := j*64 IF k[j] sum[63:0] := sum[63:0] + a[i+63:i] FI ENDFOR RETURN sum[63:0]
...
__int64 _mm512_reduce_add_epi64 (__m512i a)

Synopsis

__int64 _mm512_reduce_add_epi64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 64-bit integers in a by addition. Returns the sum of all elements in a.

Operation

sum[63:0] := 0 FOR j := 0 to 7 i := j*64 sum[63:0] := sum[63:0] + a[i+63:i] ENDFOR RETURN sum[63:0]
...
double _mm512_mask_reduce_add_pd (__mmask8 k, __m512d a)

Synopsis

double _mm512_mask_reduce_add_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed double-precision (64-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a.

Operation

sum[63:0] := 0 FOR j := 0 to 7 i := j*64 IF k[j] sum[63:0] := sum[63:0] + a[i+63:i] FI ENDFOR RETURN sum[63:0]
...
double _mm512_reduce_add_pd (__m512d a)

Synopsis

double _mm512_reduce_add_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed double-precision (64-bit) floating-point elements in a by addition. Returns the sum of all elements in a.

Operation

sum[63:0] := 0 FOR j := 0 to 7 i := j*64 sum[63:0] := sum[63:0] + a[i+63:i] ENDFOR RETURN sum[63:0]
...
float _mm512_mask_reduce_add_ps (__mmask16 k, __m512 a)

Synopsis

float _mm512_mask_reduce_add_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed single-precision (32-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a.

Operation

sum[31:0] := 0 FOR j := 0 to 15 i := j*32 IF k[j] sum[31:0] := sum[31:0] + a[i+31:i] FI ENDFOR RETURN sum[31:0]
...
float _mm512_reduce_add_ps (__m512 a)

Synopsis

float _mm512_reduce_add_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed single-precision (32-bit) floating-point elements in a by addition. Returns the sum of all elements in a.

Operation

sum[31:0] := 0 FOR j := 0 to 15 i := j*32 sum[31:0] := sum[31:0] + a[i+31:i] ENDFOR RETURN sum[31:0]
...
int _mm512_mask_reduce_and_epi32 (__mmask16 k, __m512i a)

Synopsis

int _mm512_mask_reduce_and_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 32-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.

Operation

reduced[31:0] := 0xFFFFFFFF FOR j := 0 to 15 i := j*32 IF k[j] reduced[31:0] := reduced[31:0] AND a[i+31:i] FI ENDFOR RETURN reduced[31:0]
...
int _mm512_reduce_and_epi32 (__m512i a)

Synopsis

int _mm512_reduce_and_epi32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 32-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.

Operation

reduced[31:0] := 0xFFFFFFFF FOR j := 0 to 15 i := j*32 reduced[31:0] := reduced[31:0] AND a[i+31:i] ENDFOR RETURN reduced[31:0]
...
__int64 _mm512_mask_reduce_and_epi64 (__mmask8 k, __m512i a)

Synopsis

__int64 _mm512_mask_reduce_and_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 64-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.

Operation

reduced[63:0] := 0xFFFFFFFFFFFFFFFF FOR j := 0 to 7 i := j*64 IF k[j] reduced[63:0] := reduced[63:0] AND a[i+63:i] FI ENDFOR RETURN reduced[63:0]
...
__int64 _mm512_reduce_and_epi64 (__m512i a)

Synopsis

__int64 _mm512_reduce_and_epi64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 64-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.

Operation

reduced[63:0] := 0xFFFFFFFFFFFFFFFF FOR j := 0 to 7 i := j*64 reduced[63:0] := reduced[63:0] AND a[i+63:i] ENDFOR RETURN reduced[63:0]
...
double _mm512_mask_reduce_gmax_pd (__mmask8 k, __m512d a)

Synopsis

double _mm512_mask_reduce_gmax_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Determines the maximum element of the packed double-precision (64-bit) floating-point elements stored in a and stores the result in dst. Bitmask k is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).

Operation

max = a[63:0] FOR j := 1 to 7 i := j*64 IF k[j] max = FpMax(max, a[i+63:i]) FI ENDFOR dst := max
...
double _mm512_reduce_gmax_pd (__m512d a)

Synopsis

double _mm512_reduce_gmax_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Determines the maximum element of the packed double-precision (64-bit) floating-point elements stored in a and stores the result in dst.

Operation

max = a[63:0] FOR j := 1 to 7 i := j*64 max = FpMax(max, a[i+63:i]) ENDFOR dst := max
...
float _mm512_mask_reduce_gmax_ps (__mmask16 k, __m512 a)

Synopsis

float _mm512_mask_reduce_gmax_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Determines the maximum element of the packed single-precision (32-bit) floating-point elements stored in a and stores the result in dst. Bitmask k is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).

Operation

max = a[31:0] FOR j := 1 to 15 i := j*32 IF k[j] max = FpMax(max, a[i+31:i]) FI ENDFOR dst := max
...
float _mm512_reduce_gmax_ps (__m512 a)

Synopsis

float _mm512_reduce_gmax_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Determines the maximum element of the packed single-precision (32-bit) floating-point elements stored in a and stores the result in dst.

Operation

max = a[31:0] FOR j := 1 to 15 i := j*32 max = FpMax(max, a[i+31:i]) ENDFOR dst := max
...
double _mm512_mask_reduce_gmin_pd (__mmask8 k, __m512d a)

Synopsis

double _mm512_mask_reduce_gmin_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Determines the minimum element of the packed double-precision (64-bit) floating-point elements stored in a and stores the result in dst. Bitmask k is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).

Operation

min = a[63:0] FOR j := 1 to 7 i := j*64 IF k[j] min = FpMin(min, a[i+63:i]) FI ENDFOR dst := min
...
double _mm512_reduce_gmin_pd (__m512d a)

Synopsis

double _mm512_reduce_gmin_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Determines the minimum element of the packed double-precision (64-bit) floating-point elements stored in a and stores the result in dst.

Operation

min = a[63:0] FOR j := 1 to 7 i := j*64 min = FpMin(min, a[i+63:i]) ENDFOR dst := min
...
float _mm512_mask_reduce_gmin_ps (__mmask16 k, __m512 a)

Synopsis

float _mm512_mask_reduce_gmin_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Determines the minimum element of the packed single-precision (32-bit) floating-point elements stored in a and stores the result in dst using writemask k (elements are ignored when the corresponding mask bit is not set).

Operation

min = a[31:0] FOR j := 1 to 15 i := j*32 IF k[j] min = FpMin(min, a[i+31:i]) FI ENDFOR dst := min
...
float _mm512_reduce_gmin_ps (__m512 a)

Synopsis

float _mm512_reduce_gmin_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Determines the minimum element of the packed single-precision (32-bit) floating-point elements stored in a and stores the result in dst.

Operation

min = a[31:0] FOR j := 1 to 15 i := j*32 min = FpMin(min, a[i+31:i]) ENDFOR dst := min
...
int _mm512_mask_reduce_max_epi32 (__mmask16 k, __m512i a)

Synopsis

int _mm512_mask_reduce_max_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.

Operation

max[31:0] := MIN_INT FOR j := 0 to 15 i := j*32 IF k[j] max[31:0] := MAXIMUM(max[31:0], a[i+31:i]) FI ENDFOR RETURN max[31:0]
...
int _mm512_reduce_max_epi32 (__m512i a)

Synopsis

int _mm512_reduce_max_epi32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 32-bit integers in a by maximum. Returns the maximum of all elements in a.

Operation

max[31:0] := MIN_INT FOR j := 0 to 15 i := j*32 max[31:0] := MAXIMUM(max[31:0], a[i+31:i]) ENDFOR RETURN max[31:0]
...
__int64 _mm512_mask_reduce_max_epi64 (__mmask8 k, __m512i a)

Synopsis

__int64 _mm512_mask_reduce_max_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.

Operation

max[63:0] := MIN_INT FOR j := 0 to 7 i := j*64 IF k[j] max[63:0] := MAXIMUM(max[63:0], a[i+63:i]) FI ENDFOR RETURN max[63:0]
...
__int64 _mm512_reduce_max_epi64 (__m512i a)

Synopsis

__int64 _mm512_reduce_max_epi64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 64-bit integers in a by maximum. Returns the maximum of all elements in a.

Operation

max[63:0] := MIN_INT FOR j := 0 to 7 i := j*64 max[63:0] := MAXIMUM(max[63:0], a[i+63:i]) ENDFOR RETURN max[63:0]
...
unsigned int _mm512_mask_reduce_max_epu32 (__mmask16 k, __m512i a)

Synopsis

unsigned int _mm512_mask_reduce_max_epu32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed unsigned 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.

Operation

max[31:0] := 0 FOR j := 0 to 15 i := j*32 IF k[j] max[31:0] := MAXIMUM(max[31:0], a[i+31:i]) FI ENDFOR RETURN max[31:0]
...
unsigned int _mm512_reduce_max_epu32 (__m512i a)

Synopsis

unsigned int _mm512_reduce_max_epu32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed unsigned 32-bit integers in a by maximum. Returns the maximum of all elements in a.

Operation

max[31:0] := 0 FOR j := 0 to 15 i := j*32 max[31:0] := MAXIMUM(max[31:0], a[i+31:i]) ENDFOR RETURN max[31:0]
...
unsigned __int64 _mm512_mask_reduce_max_epu64 (__mmask8 k, __m512i a)

Synopsis

unsigned __int64 _mm512_mask_reduce_max_epu64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed unsigned 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.

Operation

max[63:0] := 0 FOR j := 0 to 7 i := j*64 IF k[j] max[63:0] := MAXIMUM(max[63:0], a[i+63:i]) FI ENDFOR RETURN max[63:0]
...
unsigned __int64 _mm512_reduce_max_epu64 (__m512i a)

Synopsis

unsigned __int64 _mm512_reduce_max_epu64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed unsigned 64-bit integers in a by maximum. Returns the maximum of all elements in a.

Operation

max[63:0] := 0 FOR j := 0 to 7 i := j*64 max[63:0] := MAXIMUM(max[63:0], a[i+63:i]) ENDFOR RETURN max[63:0]
...
double _mm512_mask_reduce_max_pd (__mmask8 k, __m512d a)

Synopsis

double _mm512_mask_reduce_max_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed double-precision (64-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.

Operation

max[63:0] := MIN_DOUBLE FOR j := 0 to 7 i := j*64 IF k[j] max[63:0] := MAXIMUM(max[63:0], a[i+63:i]) FI ENDFOR RETURN max[63:0]
...
double _mm512_reduce_max_pd (__m512d a)

Synopsis

double _mm512_reduce_max_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed double-precision (64-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a.

Operation

max[63:0] := MIN_DOUBLE FOR j := 0 to 7 i := j*64 max[63:0] := MAXIMUM(max[63:0], a[i+63:i]) ENDFOR RETURN max[63:0]
...
float _mm512_mask_reduce_max_ps (__mmask16 k, __m512 a)

Synopsis

float _mm512_mask_reduce_max_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed single-precision (32-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.

Operation

max[31:0] := MIN_FLOAT FOR j := 0 to 15 i := j*32 IF k[j] max[31:0] := MAXIMUM(max[31:0], a[i+31:i]) FI ENDFOR RETURN max[31:0]
...
float _mm512_reduce_max_ps (__m512 a)

Synopsis

float _mm512_reduce_max_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed single-precision (32-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a.

Operation

max[31:0] := MIN_FLOAT FOR j := 0 to 15 i := j*32 max[31:0] := MAXIMUM(max[31:0], a[i+31:i]) ENDFOR RETURN max[31:0]
...
int _mm512_mask_reduce_min_epi32 (__mmask16 k, __m512i a)

Synopsis

int _mm512_mask_reduce_min_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 32-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.

Operation

min[31:0] := MAX_INT FOR j := 0 to 15 i := j*32 IF k[j] min[31:0] := MINIMUM(min[31:0], a[i+31:i]) FI ENDFOR RETURN min[31:0]
...
int _mm512_reduce_min_epi32 (__m512i a)

Synopsis

int _mm512_reduce_min_epi32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 32-bit integers in a by minimum. Returns the minimum of all elements in a.

Operation

min[31:0] := MAX_INT FOR j := 0 to 15 i := j*32 min[31:0] := MINIMUM(min[31:0], a[i+31:i]) ENDFOR RETURN min[31:0]
...
__int64 _mm512_mask_reduce_min_epi64 (__mmask8 k, __m512i a)

Synopsis

__int64 _mm512_mask_reduce_min_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.

Operation

min[63:0] := MAX_INT FOR j := 0 to 7 i := j*64 IF k[j] min[63:0] := MINIMUM(min[63:0], a[i+63:i]) FI ENDFOR RETURN min[63:0]
...
__int64 _mm512_reduce_min_epi64 (__m512i a)

Synopsis

__int64 _mm512_reduce_min_epi64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 64-bit integers in a by minimum. Returns the minimum of all elements in a.

Operation

min[63:0] := MAX_INT FOR j := 0 to 7 i := j*64 min[63:0] := MINIMUM(min[63:0], a[i+63:i]) ENDFOR RETURN min[63:0]
...
unsigned int _mm512_mask_reduce_min_epu32 (__mmask16 k, __m512i a)

Synopsis

unsigned int _mm512_mask_reduce_min_epu32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed unsigned 32-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.

Operation

min[31:0] := MAX_UINT FOR j := 0 to 15 i := j*32 IF k[j] min[31:0] := MINIMUM(min[31:0], a[i+31:i]) FI ENDFOR RETURN min[31:0]
...
unsigned int _mm512_reduce_min_epu32 (__m512i a)

Synopsis

unsigned int _mm512_reduce_min_epu32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed unsigned 32-bit integers in a by minimum. Returns the minimum of all elements in a.

Operation

min[31:0] := MAX_UINT FOR j := 0 to 15 i := j*32 min[31:0] := MINIMUM(min[31:0], a[i+31:i]) ENDFOR RETURN min[31:0]
...
unsigned __int64 _mm512_mask_reduce_min_epu64 (__mmask8 k, __m512i a)

Synopsis

unsigned __int64 _mm512_mask_reduce_min_epu64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed unsigned 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.

Operation

min[63:0] := MAX_UINT FOR j := 0 to 7 i := j*64 IF k[j] min[63:0] := MINIMUM(min[63:0], a[i+63:i]) FI ENDFOR RETURN min[63:0]
...
unsigned __int64 _mm512_reduce_min_epu64 (__m512i a)

Synopsis

unsigned __int64 _mm512_reduce_min_epu64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed unsigned 64-bit integers in a by minimum. Returns the minimum of all elements in a.

Operation

min[63:0] := MAX_UINT FOR j := 0 to 7 i := j*64 min[63:0] := MINIMUM(min[63:0], a[i+63:i]) ENDFOR RETURN min[63:0]
...
double _mm512_mask_reduce_min_pd (__mmask8 k, __m512d a)

Synopsis

double _mm512_mask_reduce_min_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed double-precision (64-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.

Operation

min[63:0] := MAX_DOUBLE FOR j := 0 to 7 i := j*64 IF k[j] min[63:0] := MINIMUM(min[63:0], a[i+63:i]) FI ENDFOR RETURN min[63:0]
...
double _mm512_reduce_min_pd (__m512d a)

Synopsis

double _mm512_reduce_min_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed double-precision (64-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.

Operation

min[63:0] := MAX_DOUBLE FOR j := 0 to 7 i := j*64 min[63:0] := MINIMUM(min[63:0], a[i+63:i]) ENDFOR RETURN min[63:0]
...
float _mm512_mask_reduce_min_ps (__mmask16 k, __m512 a)

Synopsis

float _mm512_mask_reduce_min_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed single-precision (32-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.

Operation

min[31:0] := MAX_FLOAT FOR j := 0 to 15 i := j*32 IF k[j] min[31:0] := MINIMUM(min[31:0], a[i+31:i]) FI ENDFOR RETURN min[31:0]
...
float _mm512_reduce_min_ps (__m512 a)

Synopsis

float _mm512_reduce_min_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed single-precision (32-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.

Operation

min[31:0] := MAX_FLOAT FOR j := 0 to 15 i := j*32 min[31:0] := MINIMUM(min[31:0], a[i+31:i]) ENDFOR RETURN min[31:0]
...
int _mm512_mask_reduce_mul_epi32 (__mmask16 k, __m512i a)

Synopsis

int _mm512_mask_reduce_mul_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 32-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.

Operation

prod[31:0] := 1 FOR j := 0 to 15 i := j*32 IF k[j] prod[31:0] := prod[31:0] * a[i+31:i] FI ENDFOR RETURN prod[31:0]
...
int _mm512_reduce_mul_epi32 (__m512i a)

Synopsis

int _mm512_reduce_mul_epi32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 32-bit integers in a by multiplication. Returns the product of all elements in a.

Operation

prod[31:0] := 1 FOR j := 0 to 15 i := j*32 prod[31:0] := prod[31:0] * a[i+31:i] ENDFOR RETURN prod[31:0]
...
__int64 _mm512_mask_reduce_mul_epi64 (__mmask8 k, __m512i a)

Synopsis

__int64 _mm512_mask_reduce_mul_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 64-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.

Operation

prod[63:0] := 1 FOR j := 0 to 7 i := j*64 IF k[j] prod[63:0] := prod[63:0] * a[i+63:i] FI ENDFOR RETURN prod[63:0]
...
__int64 _mm512_reduce_mul_epi64 (__m512i a)

Synopsis

__int64 _mm512_reduce_mul_epi64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 64-bit integers in a by multiplication. Returns the product of all elements in a.

Operation

prod[63:0] := 1 FOR j := 0 to 7 i := j*64 prod[63:0] := prod[63:0] * a[i+63:i] ENDFOR RETURN prod[63:0]
...
double _mm512_mask_reduce_mul_pd (__mmask8 k, __m512d a)

Synopsis

double _mm512_mask_reduce_mul_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a.

Operation

prod[63:0] := 1 FOR j := 0 to 7 i := j*64 IF k[j] prod[63:0] := prod[63:0] * a[i+63:i] FI ENDFOR RETURN prod[63:0]
...
double _mm512_reduce_mul_pd (__m512d a)

Synopsis

double _mm512_reduce_mul_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication. Returns the product of all elements in a.

Operation

prod[63:0] := 1 FOR j := 0 to 7 i := j*64 prod[63:0] := prod[63:0] * a[i+63:i] ENDFOR RETURN prod[63:0]
...
float _mm512_mask_reduce_mul_ps (__mmask16 k, __m512 a)

Synopsis

float _mm512_mask_reduce_mul_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a.

Operation

prod[31:0] := 1 FOR j := 0 to 15 i := j*32 IF k[j] prod[31:0] := prod[31:0] * a[i+31:i] FI ENDFOR RETURN prod[31:0]
...
float _mm512_reduce_mul_ps (__m512 a)

Synopsis

float _mm512_reduce_mul_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication. Returns the product of all elements in a.

Operation

prod[31:0] := 1 FOR j := 0 to 15 i := j*32 prod[31:0] := prod[31:0] * a[i+31:i] ENDFOR RETURN prod[31:0]
...
int _mm512_mask_reduce_or_epi32 (__mmask16 k, __m512i a)

Synopsis

int _mm512_mask_reduce_or_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 32-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a.

Operation

reduced[31:0] := 0 FOR j := 0 to 15 i := j*32 IF k[j] reduced[31:0] := reduced[31:0] OR a[i+31:i] FI ENDFOR RETURN reduced[31:0]
...
int _mm512_reduce_or_epi32 (__m512i a)

Synopsis

int _mm512_reduce_or_epi32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 32-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.

Operation

reduced[31:0] := 0 FOR j := 0 to 15 i := j*32 reduced[31:0] := reduced[31:0] OR a[i+31:i] ENDFOR RETURN reduced[31:0]
...
__int64 _mm512_mask_reduce_or_epi64 (__mmask8 k, __m512i a)

Synopsis

__int64 _mm512_mask_reduce_or_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 64-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a.

Operation

reduced[63:0] := 0 FOR j := 0 to 7 i := j*64 IF k[j] reduced[63:0] := reduced[63:0] OR a[i+63:i] FI ENDFOR RETURN reduced[63:0]
...
__int64 _mm512_reduce_or_epi64 (__m512i a)

Synopsis

__int64 _mm512_reduce_or_epi64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Reduce the packed 64-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.

Operation

reduced[63:0] := 0 FOR j := 0 to 7 i := j*64 reduced[63:0] := reduced[63:0] OR a[i+63:i] ENDFOR RETURN reduced[63:0]
vreducepd
__m128d _mm_mask_reduce_pd (__m128d src, __mmask8 k, __m128d a, int imm8)

Synopsis

__m128d _mm_mask_reduce_pd (__m128d src, __mmask8 k, __m128d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_source := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_source, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vreducepd
__m128d _mm_maskz_reduce_pd (__mmask8 k, __m128d a, int imm8)

Synopsis

__m128d _mm_maskz_reduce_pd (__mmask8 k, __m128d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_source := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_source, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vreducepd
__m128d _mm_reduce_pd (__m128d a, int imm8)

Synopsis

__m128d _mm_reduce_pd (__m128d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst.

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_source := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_source, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } FOR j := 0 to 1 i := j*64 dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0
vreducepd
__m256d _mm256_mask_reduce_pd (__m256d src, __mmask8 k, __m256d a, int imm8)

Synopsis

__m256d _mm256_mask_reduce_pd (__m256d src, __mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_source := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_source, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vreducepd
__m256d _mm256_maskz_reduce_pd (__mmask8 k, __m256d a, int imm8)

Synopsis

__m256d _mm256_maskz_reduce_pd (__mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_source := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_source, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vreducepd
__m256d _mm256_reduce_pd (__m256d a, int imm8)

Synopsis

__m256d _mm256_reduce_pd (__m256d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst.

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_source := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_source, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } FOR j := 0 to 3 i := j*64 dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0
vreducepd
__m512d _mm512_mask_reduce_pd (__m512d src, __mmask8 k, __m512d a, int imm8)

Synopsis

__m512d _mm512_mask_reduce_pd (__m512d src, __mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_source := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_source, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vreducepd
__m512d _mm512_maskz_reduce_pd (__mmask8 k, __m512d a, int imm8)

Synopsis

__m512d _mm512_maskz_reduce_pd (__mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_source := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_source, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vreducepd
__m512d _mm512_reduce_pd (__m512d a, int imm8)

Synopsis

__m512d _mm512_reduce_pd (__m512d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst.

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_source := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_source, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } FOR j := 0 to 7 i := j*64 dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vreduceps
__m128 _mm_mask_reduce_ps (__m128 src, __mmask8 k, __m128 a, int imm8)

Synopsis

__m128 _mm_mask_reduce_ps (__m128 src, __mmask8 k, __m128 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vreduceps
__m128 _mm_maskz_reduce_ps (__mmask8 k, __m128 a, int imm8)

Synopsis

__m128 _mm_maskz_reduce_ps (__mmask8 k, __m128 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vreduceps
__m128 _mm_reduce_ps (__m128 a, int imm8)

Synopsis

__m128 _mm_reduce_ps (__m128 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst.

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } FOR j := 0 to 3 i := j*32 dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0
vreduceps
__m256 _mm256_mask_reduce_ps (__m256 src, __mmask8 k, __m256 a, int imm8)

Synopsis

__m256 _mm256_mask_reduce_ps (__m256 src, __mmask8 k, __m256 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vreduceps
__m256 _mm256_maskz_reduce_ps (__mmask8 k, __m256 a, int imm8)

Synopsis

__m256 _mm256_maskz_reduce_ps (__mmask8 k, __m256 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vreduceps
__m256 _mm256_reduce_ps (__m256 a, int imm8)

Synopsis

__m256 _mm256_reduce_ps (__m256 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512VL + AVX512DQ

Description

Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst.

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } FOR j := 0 to 7 i := j*32 dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0
vreduceps
__m512 _mm512_mask_reduce_ps (__m512 src, __mmask16 k, __m512 a, int imm8)

Synopsis

__m512 _mm512_mask_reduce_ps (__m512 src, __mmask16 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vreduceps
__m512 _mm512_maskz_reduce_ps (__mmask16 k, __m512 a, int imm8)

Synopsis

__m512 _mm512_maskz_reduce_ps (__mmask16 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vreduceps
__m512 _mm512_reduce_ps (__m512 a, int imm8)

Synopsis

__m512 _mm512_reduce_ps (__m512 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst.

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } FOR j := 0 to 15 i := j*32 dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vreducepd
__m512d _mm512_mask_reduce_round_pd (__m512d src, __mmask8 k, __m512d a, int imm8, int rounding)

Synopsis

__m512d _mm512_mask_reduce_round_pd (__m512d src, __mmask8 k, __m512d a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vreducepd
__m512d _mm512_maskz_reduce_round_pd (__mmask8 k, __m512d a, int imm8, int rounding)

Synopsis

__m512d _mm512_maskz_reduce_round_pd (__mmask8 k, __m512d a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vreducepd
__m512d _mm512_reduce_round_pd (__m512d a, int imm8, int rounding)

Synopsis

__m512d _mm512_reduce_round_pd (__m512d a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } FOR j := 0 to 7 i := j*64 dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vreduceps
__m512 _mm512_mask_reduce_round_ps (__m512 src, __mmask16 k, __m512 a, int imm8, int rounding)

Synopsis

__m512 _mm512_mask_reduce_round_ps (__m512 src, __mmask16 k, __m512 a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vreduceps
__m512 _mm512_maskz_reduce_round_ps (__mmask16 k, __m512 a, int imm8, int rounding)

Synopsis

__m512 _mm512_maskz_reduce_round_ps (__mmask16 k, __m512 a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vreduceps
__m512 _mm512_reduce_round_ps (__m512 a, int imm8, int rounding)

Synopsis

__m512 _mm512_reduce_round_ps (__m512 a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } FOR j := 0 to 15 i := j*32 dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vreducesd
__m128d _mm_mask_reduce_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8, int rounding)

Synopsis

__m128d _mm_mask_reduce_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducesd
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of the lower double-precision (64-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } IF k[0] dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vreducesd
__m128d _mm_maskz_reduce_round_sd (__mmask8 k, __m128d a, __m128d b, int imm8, int rounding)

Synopsis

__m128d _mm_maskz_reduce_round_sd (__mmask8 k, __m128d a, __m128d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducesd
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of the lower double-precision (64-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } IF k[0] dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0]) ELSE dst[63:0] := 0 FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vreducesd
__m128d _mm_reduce_round_sd (__m128d a, __m128d b, int imm8, int rounding)

Synopsis

__m128d _mm_reduce_round_sd (__m128d a, __m128d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducesd
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of the lower double-precision (64-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper element from b to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0]) dst[127:64] := b[127:64] dst[MAX:128] := 0
vreducess
__m128 _mm_mask_reduce_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8, int rounding)

Synopsis

__m128 _mm_mask_reduce_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducess
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of the lower single-precision (32-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } IF k[0] dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vreducess
__m128 _mm_maskz_reduce_round_ss (__mmask8 k, __m128 a, __m128 b, int imm8, int rounding)

Synopsis

__m128 _mm_maskz_reduce_round_ss (__mmask8 k, __m128 a, __m128 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducess
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of the lower single-precision (32-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } IF k[0] dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0]) ELSE dst[31:0] := 0 FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vreducess
__m128 _mm_reduce_round_ss (__m128 a, __m128 b, int imm8, int rounding)

Synopsis

__m128 _mm_reduce_round_ss (__m128 a, __m128 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducess
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of the lower single-precision (32-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from b to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0]) dst[127:32] := b[127:32] dst[MAX:128] := 0
vreducesd
__m128d _mm_mask_reduce_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8)

Synopsis

__m128d _mm_mask_reduce_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vreducesd
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of the lower double-precision (64-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } IF k[0] dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vreducesd
__m128d _mm_maskz_reduce_sd (__mmask8 k, __m128d a, __m128d b, int imm8)

Synopsis

__m128d _mm_maskz_reduce_sd (__mmask8 k, __m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vreducesd
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of the lower double-precision (64-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } IF k[0] dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0]) ELSE dst[63:0] := 0 FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vreducesd
__m128d _mm_reduce_sd (__m128d a, __m128d b, int imm8)

Synopsis

__m128d _mm_reduce_sd (__m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vreducesd
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of the lower double-precision (64-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper element from b to the upper element of dst.

Operation

ReduceArgumentPD(src1[63:0], imm8[7:0]) { m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc) tmp[63:0] := src1[63:0] - tmp[63:0] RETURN tmp[63:0] } dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0]) dst[127:64] := b[127:64] dst[MAX:128] := 0
vreducess
__m128 _mm_mask_reduce_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8)

Synopsis

__m128 _mm_mask_reduce_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vreducess
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of the lower single-precision (32-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } IF k[0] dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vreducess
__m128 _mm_maskz_reduce_ss (__mmask8 k, __m128 a, __m128 b, int imm8)

Synopsis

__m128 _mm_maskz_reduce_ss (__mmask8 k, __m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vreducess
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of the lower single-precision (32-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } IF k[0] dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0]) ELSE dst[31:0] := 0 FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vreducess
__m128 _mm_reduce_ss (__m128 a, __m128 b, int imm8)

Synopsis

__m128 _mm_reduce_ss (__m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vreducess
CPUID Flags: AVX512DQ

Description

Extract the reduced argument of the lower single-precision (32-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from b to the upper elements of dst.

Operation

ReduceArgumentPS(src1[31:0], imm8[7:0]) { IF src1[31:0] == NAN RETURN (convert src1[31:0] to QNaN) FI m := imm8[7:4] // number of fraction bits after the binary point to be preserved rc := imm8[1:0] // round control rc_src := imm8[2] // round control source spe := 0 tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc) tmp[31:0] := src1[31:0] - tmp[31:0] RETURN tmp[31:0] } dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0]) dst[127:32] := b[127:32] dst[MAX:128] := 0
...
__m128i _mm_rem_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_rem_epi16 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed 16-bit integers in a by packed elements in b, and store the remainders as packed 16-bit integers in dst.

Operation

FOR j := 0 to 7 i := 16*j dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_rem_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_rem_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed 16-bit integers in a by packed elements in b, and store the remainders as packed 16-bit integers in dst.

Operation

FOR j := 0 to 15 i := 16*j dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_rem_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_rem_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed 16-bit integers in a by packed elements in b, and store the remainders as packed 16-bit integers in dst.

Operation

FOR j := 0 to 31 i := 16*j dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:512] := 0
...
__m128i _mm_rem_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_rem_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed 32-bit integers in a by packed elements in b, and store the remainders as packed 32-bit integers in dst.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_rem_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_rem_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed 32-bit integers in a by packed elements in b, and store the remainders as packed 32-bit integers in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_mask_rem_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_rem_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed 32-bit integers in a by packed elements in b, and store the remainders as packed 32-bit integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m512i _mm512_rem_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_rem_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed 32-bit integers in a by packed elements in b, and store the remainders as packed 32-bit integers in dst.

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m128i _mm_rem_epi64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_rem_epi64 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed 64-bit integers in a by packed elements in b, and store the remainders as packed 64-bit integers in dst.

Operation

FOR j := 0 to 1 i := 64*j dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_rem_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_rem_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed 64-bit integers in a by packed elements in b, and store the remainders as packed 64-bit integers in dst.

Operation

FOR j := 0 to 3 i := 64*j dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_rem_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_rem_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed 64-bit integers in a by packed elements in b, and store the remainders as packed 64-bit integers in dst.

Operation

FOR j := 0 to 7 i := 64*j dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m128i _mm_rem_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_rem_epi8 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed 8-bit integers in a by packed elements in b, and store the remainders as packed 8-bit integers in dst.

Operation

FOR j := 0 to 15 i := 8*j dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_rem_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_rem_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed 8-bit integers in a by packed elements in b, and store the remainders as packed 8-bit integers in dst.

Operation

FOR j := 0 to 31 i := 8*j dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_rem_epi8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_rem_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed 8-bit integers in a by packed elements in b, and store the remainders as packed 8-bit integers in dst.

Operation

FOR j := 0 to 63 i := 8*j dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:512] := 0
...
__m128i _mm_rem_epu16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_rem_epu16 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed unsigned 16-bit integers in a by packed elements in b, and store the remainders as packed unsigned 16-bit integers in dst.

Operation

FOR j := 0 to 7 i := 16*j dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_rem_epu16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_rem_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed unsigned 16-bit integers in a by packed elements in b, and store the remainders as packed unsigned 16-bit integers in dst.

Operation

FOR j := 0 to 15 i := 16*j dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_rem_epu16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_rem_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed unsigned 16-bit integers in a by packed elements in b, and store the remainders as packed unsigned 16-bit integers in dst.

Operation

FOR j := 0 to 31 i := 16*j dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:512] := 0
...
__m128i _mm_rem_epu32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_rem_epu32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed unsigned 32-bit integers in a by packed elements in b, and store the remainders as packed unsigned 32-bit integers in dst.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_rem_epu32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_rem_epu32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed unsigned 32-bit integers in a by packed elements in b, and store the remainders as packed unsigned 32-bit integers in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_mask_rem_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_rem_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed unsigned 32-bit integers in a by packed elements in b, and store the remainders as packed unsigned 32-bit integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m512i _mm512_rem_epu32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_rem_epu32 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed unsigned 32-bit integers in a by packed elements in b, and store the remainders as packed unsigned 32-bit integers in dst.

Operation

FOR j := 0 to 15 i := 32*j dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m128i _mm_rem_epu64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_rem_epu64 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed unsigned 64-bit integers in a by packed elements in b, and store the remainders as packed unsigned 64-bit integers in dst.

Operation

FOR j := 0 to 1 i := 64*j dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_rem_epu64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_rem_epu64 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed unsigned 64-bit integers in a by packed elements in b, and store the remainders as packed unsigned 64-bit integers in dst.

Operation

FOR j := 0 to 3 i := 64*j dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_rem_epu64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_rem_epu64 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed unsigned 64-bit integers in a by packed elements in b, and store the remainders as packed unsigned 64-bit integers in dst.

Operation

FOR j := 0 to 7 i := 64*j dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m128i _mm_rem_epu8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_rem_epu8 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed unsigned 8-bit integers in a by packed elements in b, and store the remainders as packed unsigned 8-bit integers in dst.

Operation

FOR j := 0 to 15 i := 8*j dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_rem_epu8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_rem_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed unsigned 8-bit integers in a by packed elements in b, and store the remainders as packed unsigned 8-bit integers in dst.

Operation

FOR j := 0 to 31 i := 8*j dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_rem_epu8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_rem_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Divide packed unsigned 8-bit integers in a by packed elements in b, and store the remainders as packed unsigned 8-bit integers in dst.

Operation

FOR j := 0 to 63 i := 8*j dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_mask_rint_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_rint_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Rounds the packed double-precision (64-bit) floating-point elements in a to the nearest even integer value and stores the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := RoundToNearestEven(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_rint_pd (__m512d a)

Synopsis

__m512d _mm512_rint_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Rounds the packed double-precision (64-bit) floating-point elements in a to the nearest even integer value and stores the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := RoundToNearestEven(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_mask_rint_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_rint_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Rounds the packed single-precision (32-bit) floating-point elements in a to the nearest even integer value and stores the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := RoundToNearestEven(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_rint_ps (__m512 a)

Synopsis

__m512 _mm512_rint_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Rounds the packed single-precision (32-bit) floating-point elements in a to the nearest even integer value and stores the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := RoundToNearestEven(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vprold
__m128i _mm_mask_rol_epi32 (__m128i src, __mmask8 k, __m128i a, const int imm8)

Synopsis

__m128i _mm_mask_rol_epi32 (__m128i src, __mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprold
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vprold
__m128i _mm_maskz_rol_epi32 (__mmask8 k, __m128i a, const int imm8)

Synopsis

__m128i _mm_maskz_rol_epi32 (__mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprold
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vprold
__m128i _mm_rol_epi32 (__m128i a, int imm8)

Synopsis

__m128i _mm_rol_epi32 (__m128i a, int imm8)
#include "immintrin.h"
Instruction: vprold
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 3 i := j*32 dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0
vprold
__m256i _mm256_mask_rol_epi32 (__m256i src, __mmask8 k, __m256i a, const int imm8)

Synopsis

__m256i _mm256_mask_rol_epi32 (__m256i src, __mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprold
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vprold
__m256i _mm256_maskz_rol_epi32 (__mmask8 k, __m256i a, const int imm8)

Synopsis

__m256i _mm256_maskz_rol_epi32 (__mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprold
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vprold
__m256i _mm256_rol_epi32 (__m256i a, const int imm8)

Synopsis

__m256i _mm256_rol_epi32 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprold
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 7 i := j*32 dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0
vprold
__m512i _mm512_mask_rol_epi32 (__m512i src, __mmask16 k, __m512i a, const int imm8)

Synopsis

__m512i _mm512_mask_rol_epi32 (__m512i src, __mmask16 k, __m512i a, const int imm8)
#include "immintrin.h"
Instruction: vprold zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vprold
__m512i _mm512_maskz_rol_epi32 (__mmask16 k, __m512i a, const int imm8)

Synopsis

__m512i _mm512_maskz_rol_epi32 (__mmask16 k, __m512i a, const int imm8)
#include "immintrin.h"
Instruction: vprold zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vprold
__m512i _mm512_rol_epi32 (__m512i a, const int imm8)

Synopsis

__m512i _mm512_rol_epi32 (__m512i a, const int imm8)
#include "immintrin.h"
Instruction: vprold zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 15 i := j*32 dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vprolq
__m128i _mm_mask_rol_epi64 (__m128i src, __mmask8 k, __m128i a, const int imm8)

Synopsis

__m128i _mm_mask_rol_epi64 (__m128i src, __mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vprolq
__m128i _mm_maskz_rol_epi64 (__mmask8 k, __m128i a, const int imm8)

Synopsis

__m128i _mm_maskz_rol_epi64 (__mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vprolq
__m128i _mm_rol_epi64 (__m128i a, const int imm8)

Synopsis

__m128i _mm_rol_epi64 (__m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 1 i := j*64 dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0
vprolq
__m256i _mm256_mask_rol_epi64 (__m256i src, __mmask8 k, __m256i a, const int imm8)

Synopsis

__m256i _mm256_mask_rol_epi64 (__m256i src, __mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vprolq
__m256i _mm256_maskz_rol_epi64 (__mmask8 k, __m256i a, const int imm8)

Synopsis

__m256i _mm256_maskz_rol_epi64 (__mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vprolq
__m256i _mm256_rol_epi64 (__m256i a, const int imm8)

Synopsis

__m256i _mm256_rol_epi64 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 3 i := j*64 dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0
vprolq
__m512i _mm512_mask_rol_epi64 (__m512i src, __mmask8 k, __m512i a, const int imm8)

Synopsis

__m512i _mm512_mask_rol_epi64 (__m512i src, __mmask8 k, __m512i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vprolq
__m512i _mm512_maskz_rol_epi64 (__mmask8 k, __m512i a, const int imm8)

Synopsis

__m512i _mm512_maskz_rol_epi64 (__mmask8 k, __m512i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vprolq
__m512i _mm512_rol_epi64 (__m512i a, const int imm8)

Synopsis

__m512i _mm512_rol_epi64 (__m512i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 7 i := j*64 dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vprolvd
__m128i _mm_mask_rolv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_rolv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprolvd
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vprolvd
__m128i _mm_maskz_rolv_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_rolv_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprolvd
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vprolvd
__m128i _mm_rolv_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_rolv_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprolvd
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 3 i := j*32 dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:128] := 0
vprolvd
__m256i _mm256_mask_rolv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_rolv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprolvd
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vprolvd
__m256i _mm256_maskz_rolv_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_rolv_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprolvd
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vprolvd
__m256i _mm256_rolv_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_rolv_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprolvd
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 7 i := j*32 dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:256] := 0
vprolvd
__m512i _mm512_mask_rolv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_rolv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprolvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vprolvd
__m512i _mm512_maskz_rolv_epi32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_rolv_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprolvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vprolvd
__m512i _mm512_rolv_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_rolv_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprolvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.

Operation

LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 15 i := j*32 dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
vprolvq
__m128i _mm_mask_rolv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_rolv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprolvq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vprolvq
__m128i _mm_maskz_rolv_epi64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_rolv_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprolvq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vprolvq
__m128i _mm_rolv_epi64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_rolv_epi64 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprolvq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 1 i := j*64 dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:128] := 0
vprolvq
__m256i _mm256_mask_rolv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_rolv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprolvq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vprolvq
__m256i _mm256_maskz_rolv_epi64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_rolv_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprolvq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vprolvq
__m256i _mm256_rolv_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_rolv_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprolvq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 3 i := j*64 dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:256] := 0
vprolvq
__m512i _mm512_mask_rolv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_rolv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprolvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vprolvq
__m512i _mm512_maskz_rolv_epi64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_rolv_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprolvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vprolvq
__m512i _mm512_rolv_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_rolv_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprolvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.

Operation

LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 7 i := j*64 dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
vprord
__m128i _mm_mask_ror_epi32 (__m128i src, __mmask8 k, __m128i a, const int imm8)

Synopsis

__m128i _mm_mask_ror_epi32 (__m128i src, __mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprord
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vprord
__m128i _mm_maskz_ror_epi32 (__mmask8 k, __m128i a, const int imm8)

Synopsis

__m128i _mm_maskz_ror_epi32 (__mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprord
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vprord
__m128i _mm_ror_epi32 (__m128i a, const int imm8)

Synopsis

__m128i _mm_ror_epi32 (__m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprord
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 3 i := j*32 dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0
vprord
__m256i _mm256_mask_ror_epi32 (__m256i src, __mmask8 k, __m256i a, const int imm8)

Synopsis

__m256i _mm256_mask_ror_epi32 (__m256i src, __mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprord
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vprord
__m256i _mm256_maskz_ror_epi32 (__mmask8 k, __m256i a, const int imm8)

Synopsis

__m256i _mm256_maskz_ror_epi32 (__mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprord
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vprord
__m256i _mm256_ror_epi32 (__m256i a, const int imm8)

Synopsis

__m256i _mm256_ror_epi32 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprord
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 7 i := j*32 dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0
vprord
__m512i _mm512_mask_ror_epi32 (__m512i src, __mmask16 k, __m512i a, int imm8)

Synopsis

__m512i _mm512_mask_ror_epi32 (__m512i src, __mmask16 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vprord zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vprord
__m512i _mm512_maskz_ror_epi32 (__mmask16 k, __m512i a, int imm8)

Synopsis

__m512i _mm512_maskz_ror_epi32 (__mmask16 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vprord zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vprord
__m512i _mm512_ror_epi32 (__m512i a, int imm8)

Synopsis

__m512i _mm512_ror_epi32 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vprord zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 15 i := j*32 dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vprorq
__m128i _mm_mask_ror_epi64 (__m128i src, __mmask8 k, __m128i a, const int imm8)

Synopsis

__m128i _mm_mask_ror_epi64 (__m128i src, __mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprorq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vprorq
__m128i _mm_maskz_ror_epi64 (__mmask8 k, __m128i a, const int imm8)

Synopsis

__m128i _mm_maskz_ror_epi64 (__mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprorq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vprorq
__m128i _mm_ror_epi64 (__m128i a, const int imm8)

Synopsis

__m128i _mm_ror_epi64 (__m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprorq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 1 i := j*64 dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0
vprorq
__m256i _mm256_mask_ror_epi64 (__m256i src, __mmask8 k, __m256i a, const int imm8)

Synopsis

__m256i _mm256_mask_ror_epi64 (__m256i src, __mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprorq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vprorq
__m256i _mm256_maskz_ror_epi64 (__mmask8 k, __m256i a, const int imm8)

Synopsis

__m256i _mm256_maskz_ror_epi64 (__mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprorq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vprorq
__m256i _mm256_ror_epi64 (__m256i a, const int imm8)

Synopsis

__m256i _mm256_ror_epi64 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprorq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 3 i := j*64 dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0
vprorq
__m512i _mm512_mask_ror_epi64 (__m512i src, __mmask8 k, __m512i a, int imm8)

Synopsis

__m512i _mm512_mask_ror_epi64 (__m512i src, __mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vprorq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vprorq
__m512i _mm512_maskz_ror_epi64 (__mmask8 k, __m512i a, int imm8)

Synopsis

__m512i _mm512_maskz_ror_epi64 (__mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vprorq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vprorq
__m512i _mm512_ror_epi64 (__m512i a, int imm8)

Synopsis

__m512i _mm512_ror_epi64 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vprorq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 7 i := j*64 dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vprorvd
__m128i _mm_mask_rorv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_rorv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprorvd
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vprorvd
__m128i _mm_maskz_rorv_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_rorv_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprorvd
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vprorvd
__m128i _mm_rorv_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_rorv_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprorvd
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 3 i := j*32 dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:128] := 0
vprorvd
__m256i _mm256_mask_rorv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_rorv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprorvd
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vprorvd
__m256i _mm256_maskz_rorv_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_rorv_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprorvd
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vprorvd
__m256i _mm256_rorv_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_rorv_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprorvd
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 7 i := j*32 dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:256] := 0
vprorvd
__m512i _mm512_mask_rorv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_rorv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprorvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vprorvd
__m512i _mm512_maskz_rorv_epi32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_rorv_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprorvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vprorvd
__m512i _mm512_rorv_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_rorv_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprorvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.

Operation

RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 15 i := j*32 dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
vprorvq
__m128i _mm_mask_rorv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_rorv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprorvq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vprorvq
__m128i _mm_maskz_rorv_epi64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_rorv_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprorvq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vprorvq
__m128i _mm_rorv_epi64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_rorv_epi64 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprorvq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 1 i := j*64 dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:128] := 0
vprorvq
__m256i _mm256_mask_rorv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_rorv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprorvq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vprorvq
__m256i _mm256_maskz_rorv_epi64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_rorv_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprorvq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vprorvq
__m256i _mm256_rorv_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_rorv_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprorvq
CPUID Flags: AVX512VL + AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 3 i := j*64 dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:256] := 0
vprorvq
__m512i _mm512_mask_rorv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_rorv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprorvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vprorvq
__m512i _mm512_maskz_rorv_epi64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_rorv_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprorvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vprorvq
__m512i _mm512_rorv_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_rorv_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprorvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.

Operation

RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 7 i := j*64 dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
rol
unsigned int _rotl (unsigned int a, int shift)

Synopsis

unsigned int _rotl (unsigned int a, int shift)
#include "immintrin.h"
Instruction: rol r32, imm

Description

Shift the bits of unsigned 32-bit integer a left by the number of bits specified in shift, rotating the most-significant bit to the least-significant bit location, and store the unsigned result in dst.

Operation

dst := a count := shift BITWISE AND 31 DO WHILE (count > 0) tmp[0] := dst[31] dst := (dst << 1) OR tmp[0] count := count - 1 OD

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge2-
Westmere1-
Nehalem1-
ror
unsigned int _rotr (unsigned int a, int shift)

Synopsis

unsigned int _rotr (unsigned int a, int shift)
#include "immintrin.h"
Instruction: ror r32, imm

Description

Shift the bits of unsigned 32-bit integer a right by the number of bits specified in shift, rotating the least-significant bit to the most-significant bit location, and store the unsigned result in dst.

Operation

dst := a count := shift BITWISE AND 31 DO WHILE (count > 0) tmp[31] := dst[0] dst := (dst >> 1) OR tmp[31] count := count - 1 OD

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge2-
Westmere1-
Nehalem1-
rol
unsigned short _rotwl (unsigned short a, int shift)

Synopsis

unsigned short _rotwl (unsigned short a, int shift)
#include "immintrin.h"
Instruction: rol r16, imm

Description

Shift the bits of unsigned 16-bit integer a left by the number of bits specified in shift, rotating the most-significant bit to the least-significant bit location, and store the unsigned result in dst.

Operation

dst := a count := shift BITWISE AND 15 DO WHILE (count > 0) tmp[0] := dst[15] dst := (dst << 1) OR tmp[0] count := count - 1 OD

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge2-
Westmere1-
Nehalem1-
ror
unsigned short _rotwr (unsigned short a, int shift)

Synopsis

unsigned short _rotwr (unsigned short a, int shift)
#include "immintrin.h"
Instruction: ror r16, imm

Description

Shift the bits of unsigned 16-bit integer a right by the number of bits specified in shift, rotating the least-significant bit to the most-significant bit location, and store the unsigned result in dst.

Operation

dst := a count := shift BITWISE AND 15 DO WHILE (count > 0) tmp[15] := dst[0] dst := (dst >> 1) OR tmp[15] count := count - 1 OD

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge2-
Westmere1-
Nehalem1-
roundpd
__m128d _mm_round_pd (__m128d a, int rounding)

Synopsis

__m128d _mm_round_pd (__m128d a, int rounding)
#include "smmintrin.h"
Instruction: roundpd xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Round the packed double-precision (64-bit) floating-point elements in a using the rounding parameter, and store the results as packed double-precision floating-point elements in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ROUND(a[i+63:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell62
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vroundpd
__m256d _mm256_round_pd (__m256d a, int rounding)

Synopsis

__m256d _mm256_round_pd (__m256d a, int rounding)
#include "immintrin.h"
Instruction: vroundpd ymm, ymm, imm
CPUID Flags: AVX

Description

Round the packed double-precision (64-bit) floating-point elements in a using the rounding parameter, and store the results as packed double-precision floating-point elements in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ROUND(a[i+63:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell61
Ivy Bridge31
Sandy Bridge31
roundps
__m128 _mm_round_ps (__m128 a, int rounding)

Synopsis

__m128 _mm_round_ps (__m128 a, int rounding)
#include "smmintrin.h"
Instruction: roundps xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Round the packed single-precision (32-bit) floating-point elements in a using the rounding parameter, and store the results as packed single-precision floating-point elements in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ROUND(a[i+31:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell62
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vroundps
__m256 _mm256_round_ps (__m256 a, int rounding)

Synopsis

__m256 _mm256_round_ps (__m256 a, int rounding)
#include "immintrin.h"
Instruction: vroundps ymm, ymm, imm
CPUID Flags: AVX

Description

Round the packed single-precision (32-bit) floating-point elements in a using the rounding parameter, and store the results as packed single-precision floating-point elements in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ROUND(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell61
Ivy Bridge31
Sandy Bridge31
vroundps
__m512 _mm512_mask_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)

Synopsis

__m512 _mm512_mask_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vroundps zmm {k}, m512, imm
CPUID Flags: KNCNI

Description

Round the packed single-precision (32-bit) floating-point elements in a to the nearest integer value using expadj and in the direction of rounding, and store the results as packed single-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ROUND(a[i+31:i]) CASE expadj OF _MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0 _MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4 _MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5 _MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8 _MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16 _MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24 _MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31 _MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32 ESAC ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vroundps
__m512 _mm512_round_ps (__m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)

Synopsis

__m512 _mm512_round_ps (__m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vroundps zmm {k}, m512, imm
CPUID Flags: KNCNI

Description

Round the packed single-precision (32-bit) floating-point elements in a to the nearest integer value using expadj and in the direction of rounding, and store the results as packed single-precision floating-point elements in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ROUND(a[i+31:i]) CASE expadj OF _MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0 _MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4 _MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5 _MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8 _MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16 _MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24 _MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31 _MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32 ESAC ENDFOR dst[MAX:512] := 0
roundsd
__m128d _mm_round_sd (__m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_round_sd (__m128d a, __m128d b, int rounding)
#include "smmintrin.h"
Instruction: roundsd xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Round the lower double-precision (64-bit) floating-point element in b using the rounding parameter, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := ROUND(b[63:0]) dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell62
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
roundss
__m128 _mm_round_ss (__m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_round_ss (__m128 a, __m128 b, int rounding)
#include "smmintrin.h"
Instruction: roundss xmm, xmm, imm
CPUID Flags: SSE4.1

Description

Round the lower single-precision (32-bit) floating-point element in b using the rounding parameter, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := ROUND(b[31:0]) dst[127:32] := a[127:32]

Performance

ArchitectureLatencyThroughput
Haswell62
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vrndfxpntpd
__m512d _mm512_mask_roundfxpnt_adjust_pd (__m512d src, __mmask8 k, __m512d a, int rounding, _MM_EXP_ADJ_ENUM expadj)

Synopsis

__m512d _mm512_mask_roundfxpnt_adjust_pd (__m512d src, __mmask8 k, __m512d a, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vrndfxpntpd zmm {k}, m512, imm
CPUID Flags: KNCNI

Description

Performs element-by-element rounding of packed double-precision (64-bit) floating-point elements in a using expadj and in the direction of rounding and stores results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ROUND(a[i+63:i]) CASE expadj OF _MM_EXPADJ_NONE: dst[i+63:i] = dst[i+63:i] * 2**0 _MM_EXPADJ_4: dst[i+63:i] = dst[i+63:i] * 2**4 _MM_EXPADJ_5: dst[i+63:i] = dst[i+63:i] * 2**5 _MM_EXPADJ_8: dst[i+63:i] = dst[i+63:i] * 2**8 _MM_EXPADJ_16: dst[i+63:i] = dst[i+63:i] * 2**16 _MM_EXPADJ_24: dst[i+63:i] = dst[i+63:i] * 2**24 _MM_EXPADJ_31: dst[i+63:i] = dst[i+63:i] * 2**31 _MM_EXPADJ_32: dst[i+63:i] = dst[i+63:i] * 2**32 ESAC ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vrndfxpntpd
__m512d _mm512_roundfxpnt_adjust_pd (__m512d a, int rounding, _MM_EXP_ADJ_ENUM expadj)

Synopsis

__m512d _mm512_roundfxpnt_adjust_pd (__m512d a, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vrndfxpntpd zmm {k}, m512, imm
CPUID Flags: KNCNI

Description

Performs element-by-element rounding of packed double-precision (64-bit) floating-point elements in a using expadj and in the direction of rounding and stores results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ROUND(a[i+63:i]) CASE expadj OF _MM_EXPADJ_NONE: dst[i+63:i] = dst[i+63:i] * 2**0 _MM_EXPADJ_4: dst[i+63:i] = dst[i+63:i] * 2**4 _MM_EXPADJ_5: dst[i+63:i] = dst[i+63:i] * 2**5 _MM_EXPADJ_8: dst[i+63:i] = dst[i+63:i] * 2**8 _MM_EXPADJ_16: dst[i+63:i] = dst[i+63:i] * 2**16 _MM_EXPADJ_24: dst[i+63:i] = dst[i+63:i] * 2**24 _MM_EXPADJ_31: dst[i+63:i] = dst[i+63:i] * 2**31 _MM_EXPADJ_32: dst[i+63:i] = dst[i+63:i] * 2**32 ESAC ENDFOR dst[MAX:512] := 0
vrndfxpntps
__m512 _mm512_mask_roundfxpnt_adjust_ps (__m512 src, __mmask16 k, __m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)

Synopsis

__m512 _mm512_mask_roundfxpnt_adjust_ps (__m512 src, __mmask16 k, __m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vrndfxpntps zmm {k}, m512, imm
CPUID Flags: KNCNI

Description

Performs element-by-element rounding of packed single-precision (32-bit) floating-point elements in a using expadj and in the direction of rounding and stores results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ROUND(a[i+31:i]) CASE expadj OF _MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0 _MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4 _MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5 _MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8 _MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16 _MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24 _MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31 _MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32 ESAC ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vrndfxpntps
__m512 _mm512_roundfxpnt_adjust_ps (__m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)

Synopsis

__m512 _mm512_roundfxpnt_adjust_ps (__m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vrndfxpntps zmm {k}, m512, imm
CPUID Flags: KNCNI

Description

Performs element-by-element rounding of packed single-precision (32-bit) floating-point elements in a using expadj and in the direction of rounding and stores results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ROUND(a[i+31:i]) CASE expadj OF _MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0 _MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4 _MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5 _MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8 _MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16 _MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24 _MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31 _MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32 ESAC ENDFOR dst[MAX:512] := 0
vrndscalepd
__m128d _mm_mask_roundscale_pd (__m128d src, __mmask8 k, __m128d a, int imm8)

Synopsis

__m128d _mm_mask_roundscale_pd (__m128d src, __mmask8 k, __m128d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd
CPUID Flags: AVX512VL + AVX512F

Description

Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vrndscalepd
__m128d _mm_maskz_roundscale_pd (__mmask8 k, __m128d a, int imm8)

Synopsis

__m128d _mm_maskz_roundscale_pd (__mmask8 k, __m128d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd
CPUID Flags: AVX512VL + AVX512F

Description

Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vrndscalepd
__m128d _mm_roundscale_pd (__m128d a, int imm8)

Synopsis

__m128d _mm_roundscale_pd (__m128d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd
CPUID Flags: AVX512VL + AVX512F

Description

Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } FOR j := 0 to 1 i := j*64 dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0
vrndscalepd
__m256d _mm256_mask_roundscale_pd (__m256d src, __mmask8 k, __m256d a, int imm8)

Synopsis

__m256d _mm256_mask_roundscale_pd (__m256d src, __mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd
CPUID Flags: AVX512VL + AVX512F

Description

Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vrndscalepd
__m256d _mm256_maskz_roundscale_pd (__mmask8 k, __m256d a, int imm8)

Synopsis

__m256d _mm256_maskz_roundscale_pd (__mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd
CPUID Flags: AVX512VL + AVX512F

Description

Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vrndscalepd
__m256d _mm256_roundscale_pd (__m256d a, int imm8)

Synopsis

__m256d _mm256_roundscale_pd (__m256d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd
CPUID Flags: AVX512VL + AVX512F

Description

Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } FOR j := 0 to 3 i := j*64 dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0
vrndscalepd
__m512d _mm512_mask_roundscale_pd (__m512d src, __mmask8 k, __m512d a, int imm8)

Synopsis

__m512d _mm512_mask_roundscale_pd (__m512d src, __mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vrndscalepd
__m512d _mm512_maskz_roundscale_pd (__mmask8 k, __m512d a, int imm8)

Synopsis

__m512d _mm512_maskz_roundscale_pd (__mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vrndscalepd
__m512d _mm512_roundscale_pd (__m512d a, int imm8)

Synopsis

__m512d _mm512_roundscale_pd (__m512d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } FOR j := 0 to 7 i := j*64 dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vrndscaleps
__m128 _mm_mask_roundscale_ps (__m128 src, __mmask8 k, __m128 a, int imm8)

Synopsis

__m128 _mm_mask_roundscale_ps (__m128 src, __mmask8 k, __m128 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps
CPUID Flags: AVX512VL + AVX512F

Description

Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vrndscaleps
__m128 _mm_maskz_roundscale_ps (__mmask8 k, __m128 a, int imm8)

Synopsis

__m128 _mm_maskz_roundscale_ps (__mmask8 k, __m128 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps
CPUID Flags: AVX512VL + AVX512F

Description

Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vrndscaleps
__m128 _mm_roundscale_ps (__m128 a, int imm8)

Synopsis

__m128 _mm_roundscale_ps (__m128 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps
CPUID Flags: AVX512VL + AVX512F

Description

Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } FOR j := 0 to 3 i := j*32 dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0
vrndscaleps
__m256 _mm256_mask_roundscale_ps (__m256 src, __mmask8 k, __m256 a, int imm8)

Synopsis

__m256 _mm256_mask_roundscale_ps (__m256 src, __mmask8 k, __m256 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps
CPUID Flags: AVX512VL + AVX512F

Description

Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vrndscaleps
__m256 _mm256_maskz_roundscale_ps (__mmask8 k, __m256 a, int imm8)

Synopsis

__m256 _mm256_maskz_roundscale_ps (__mmask8 k, __m256 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps
CPUID Flags: AVX512VL + AVX512F

Description

Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vrndscaleps
__m256 _mm256_roundscale_ps (__m256 a, int imm8)

Synopsis

__m256 _mm256_roundscale_ps (__m256 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps
CPUID Flags: AVX512VL + AVX512F

Description

Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } FOR j := 0 to 7 i := j*32 dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0
vrndscaleps
__m512 _mm512_mask_roundscale_ps (__m512 src, __mmask16 k, __m512 a, int imm8)

Synopsis

__m512 _mm512_mask_roundscale_ps (__m512 src, __mmask16 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vrndscaleps
__m512 _mm512_maskz_roundscale_ps (__mmask16 k, __m512 a, int imm8)

Synopsis

__m512 _mm512_maskz_roundscale_ps (__mmask16 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vrndscaleps
__m512 _mm512_roundscale_ps (__m512 a, int imm8)

Synopsis

__m512 _mm512_roundscale_ps (__m512 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } FOR j := 0 to 15 i := j*32 dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vrndscalepd
__m512d _mm512_mask_roundscale_round_pd (__m512d src, __mmask8 k, __m512d a, int imm8, int rounding)

Synopsis

__m512d _mm512_mask_roundscale_round_pd (__m512d src, __mmask8 k, __m512d a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrndscalepd zmm {k}, zmm, imm {sae}
CPUID Flags: AVX512F

Description

Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vrndscalepd
__m512d _mm512_maskz_roundscale_round_pd (__mmask8 k, __m512d a, int imm8, int rounding)

Synopsis

__m512d _mm512_maskz_roundscale_round_pd (__mmask8 k, __m512d a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrndscalepd zmm {k}, zmm, imm {sae}
CPUID Flags: AVX512F

Description

Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vrndscalepd
__m512d _mm512_roundscale_round_pd (__m512d a, int imm8, int rounding)

Synopsis

__m512d _mm512_roundscale_round_pd (__m512d a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrndscalepd zmm {k}, zmm, imm {sae}
CPUID Flags: AVX512F

Description

Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } FOR j := 0 to 7 i := j*64 dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vrndscaleps
__m512 _mm512_mask_roundscale_round_ps (__m512 src, __mmask16 k, __m512 a, int imm8, int rounding)

Synopsis

__m512 _mm512_mask_roundscale_round_ps (__m512 src, __mmask16 k, __m512 a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrndscaleps zmm {k}, zmm, imm {sae}
CPUID Flags: AVX512F

Description

Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vrndscaleps
__m512 _mm512_maskz_roundscale_round_ps (__mmask16 k, __m512 a, int imm8, int rounding)

Synopsis

__m512 _mm512_maskz_roundscale_round_ps (__mmask16 k, __m512 a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrndscaleps zmm {k}, zmm, imm {sae}
CPUID Flags: AVX512F

Description

Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vrndscaleps
__m512 _mm512_roundscale_round_ps (__m512 a, int imm8, int rounding)

Synopsis

__m512 _mm512_roundscale_round_ps (__m512 a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrndscaleps zmm {k}, zmm, imm {sae}
CPUID Flags: AVX512F

Description

Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } FOR j := 0 to 15 i := j*32 dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
vrndscalesd
__m128d _mm_mask_roundscale_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, const int imm8, const int rounding)

Synopsis

__m128d _mm_mask_roundscale_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, const int imm8, const int rounding)
#include "immintrin.h"
Instruction: vrndscalesd xmm {k}, xmm, xmm, imm {sae}
CPUID Flags: AVX512F

Description

Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } IF k[0] dst[63:0] := RoundTo_IntegerPD(b[63:0], imm8[7:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vrndscalesd
__m128d _mm_maskz_roundscale_round_sd (__mmask8 k, __m128d a, __m128d b, const int imm8, const int rounding)

Synopsis

__m128d _mm_maskz_roundscale_round_sd (__mmask8 k, __m128d a, __m128d b, const int imm8, const int rounding)
#include "immintrin.h"
Instruction: vrndscalesd xmm {k}, xmm, xmm, imm {sae}
CPUID Flags: AVX512F

Description

Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } IF k[0] dst[63:0] := RoundTo_IntegerPD(b[63:0], imm8[7:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vrndscalesd
__m128d _mm_roundscale_round_sd (__m128d a, __m128d b, const int imm8, const int rounding)

Synopsis

__m128d _mm_roundscale_round_sd (__m128d a, __m128d b, const int imm8, const int rounding)
#include "immintrin.h"
Instruction: vrndscalesd xmm {k}, xmm, xmm, imm {sae}
CPUID Flags: AVX512F

Description

Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } dst[63:0] := RoundTo_IntegerPD(b[63:0], imm8[7:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0
vrndscaless
__m128 _mm_mask_roundscale_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, const int imm8, const int rounding)

Synopsis

__m128 _mm_mask_roundscale_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, const int imm8, const int rounding)
#include "immintrin.h"
Instruction: vrndscaless xmm {k}, xmm, xmm, imm {sae}
CPUID Flags: AVX512F

Description

Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } IF k[0] dst[31:0] := RoundTo_IntegerPS(b[31:0], imm8[7:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vrndscaless
__m128 _mm_maskz_roundscale_round_ss (__mmask8 k, __m128 a, __m128 b, const int imm8, const int rounding)

Synopsis

__m128 _mm_maskz_roundscale_round_ss (__mmask8 k, __m128 a, __m128 b, const int imm8, const int rounding)
#include "immintrin.h"
Instruction: vrndscaless xmm {k}, xmm, xmm, imm {sae}
CPUID Flags: AVX512F

Description

Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } IF k[0] dst[31:0] := RoundTo_IntegerPS(b[31:0], imm8[7:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vrndscaless
__m128 _mm_roundscale_round_ss (__m128 a, __m128 b, const int imm8, const int rounding)

Synopsis

__m128 _mm_roundscale_round_ss (__m128 a, __m128 b, const int imm8, const int rounding)
#include "immintrin.h"
Instruction: vrndscaless xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F

Description

Round the lower single-precision (32-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from b to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0]) dst[127:32] := b[127:32] dst[MAX:128] := 0
vrndscalesd
__m128d _mm_mask_roundscale_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, const int imm8)

Synopsis

__m128d _mm_mask_roundscale_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vrndscalesd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Round the lower double-precision (64-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } IF k[0] dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vrndscalesd
__m128d _mm_maskz_roundscale_sd (__mmask8 k, __m128d a, __m128d b, const int imm8)

Synopsis

__m128d _mm_maskz_roundscale_sd (__mmask8 k, __m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vrndscalesd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Round the lower double-precision (64-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } IF k[0] dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0]) ELSE dst[63:0] := 0 FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vrndscalesd
__m128d _mm_roundscale_sd (__m128d a, __m128d b, const int imm8)

Synopsis

__m128d _mm_roundscale_sd (__m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vrndscalesd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Round the lower double-precision (64-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from b to the upper element of dst.

Operation

RoundTo_IntegerPD(src[63:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0]) 1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0]) 2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0]) 3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0]) ESAC dst[63:0] := 2^-M * tmp[63:0] // scale back down IF imm8[3] == 0 //check SPE IF src[63:0] != dst[63:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[63:0] } dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0]) dst[127:64] := b[127:64] dst[MAX:128] := 0
vrndscaless
__m128 _mm_mask_roundscale_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, const int imm8)

Synopsis

__m128 _mm_mask_roundscale_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vrndscaless xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Round the lower single-precision (32-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } IF k[0] dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vrndscaless
__m128 _mm_maskz_roundscale_ss (__mmask8 k, __m128 a, __m128 b, const int imm8)

Synopsis

__m128 _mm_maskz_roundscale_ss (__mmask8 k, __m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vrndscaless xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Round the lower single-precision (32-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } IF k[0] dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0]) ELSE dst[31:0] := 0 FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vrndscaless
__m128 _mm_roundscale_ss (__m128 a, __m128 b, const int imm8)

Synopsis

__m128 _mm_roundscale_ss (__m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vrndscaless xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F

Description

Round the lower single-precision (32-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from b to the upper elements of dst.

Operation

RoundTo_IntegerPS(src[31:0], imm8[7:0]){ IF(imm8[2] == 1) rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC ELSE rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0] FI M := imm8[7:4] // The scaling factor (number of fraction bits to round to) CASE(rounding_direction) 0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0]) 1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0]) 2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0]) 3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0]) ESAC dst[31:0] := 2^-M * tmp[31:0] // scale back down IF imm8[3] == 0 //check SPE IF src[31:0] != dst[31:0] //check if precision has been lost set_precision() //set #PE FI FI RETURN dst[31:0] } dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0]) dst[127:32] := b[127:32] dst[MAX:128] := 0
rsqrtps
__m128 _mm_rsqrt_ps (__m128 a)

Synopsis

__m128 _mm_rsqrt_ps (__m128 a)
#include "xmmintrin.h"
Instruction: rsqrtps xmm, xmm
CPUID Flags: SSE

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 1.5*2^-12.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i])) ENDFOR

Performance

Architecture    Latency    Throughput
Haswell         5          1
Ivy Bridge      5          1
Sandy Bridge    5          1
Westmere        3          2
Nehalem         3          2
vrsqrtps
__m256 _mm256_rsqrt_ps (__m256 a)

Synopsis

__m256 _mm256_rsqrt_ps (__m256 a)
#include "immintrin.h"
Instruction: vrsqrtps ymm, ymm
CPUID Flags: AVX

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 1.5*2^-12.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i])) ENDFOR dst[MAX:256] := 0

Performance

Architecture    Latency    Throughput
Haswell         7          1
Ivy Bridge      7          1
Sandy Bridge    7          1
rsqrtss
__m128 _mm_rsqrt_ss (__m128 a)

Synopsis

__m128 _mm_rsqrt_ss (__m128 a)
#include "xmmintrin.h"
Instruction: rsqrtss xmm, xmm
CPUID Flags: SSE

Description

Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 1.5*2^-12.

Operation

dst[31:0] := APPROXIMATE(1.0 / SQRT(a[31:0])) dst[127:32] := a[127:32]

Performance

Architecture    Latency    Throughput
Haswell         5          1
Ivy Bridge      5          1
Sandy Bridge    5          1
Westmere        3          3
Nehalem         3          3
vrsqrt14pd
__m128d _mm_mask_rsqrt14_pd (__m128d src, __mmask8 k, __m128d a)

Synopsis

__m128d _mm_mask_rsqrt14_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vrsqrt14pd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i])) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vrsqrt14pd
__m128d _mm_maskz_rsqrt14_pd (__mmask8 k, __m128d a)

Synopsis

__m128d _mm_maskz_rsqrt14_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vrsqrt14pd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i])) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vrsqrt14pd
__m256d _mm256_mask_rsqrt14_pd (__m256d src, __mmask8 k, __m256d a)

Synopsis

__m256d _mm256_mask_rsqrt14_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vrsqrt14pd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i])) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vrsqrt14pd
__m256d _mm256_maskz_rsqrt14_pd (__mmask8 k, __m256d a)

Synopsis

__m256d _mm256_maskz_rsqrt14_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vrsqrt14pd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i])) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vrsqrt14pd
__m512d _mm512_mask_rsqrt14_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_rsqrt14_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrsqrt14pd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i])) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vrsqrt14pd
__m512d _mm512_maskz_rsqrt14_pd (__mmask8 k, __m512d a)

Synopsis

__m512d _mm512_maskz_rsqrt14_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrsqrt14pd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i])) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vrsqrt14pd
__m512d _mm512_rsqrt14_pd (__m512d a)

Synopsis

__m512d _mm512_rsqrt14_pd (__m512d a)
#include "immintrin.h"
Instruction: vrsqrt14pd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i])) ENDFOR dst[MAX:512] := 0
vrsqrt14ps
__m128 _mm_mask_rsqrt14_ps (__m128 src, __mmask8 k, __m128 a)

Synopsis

__m128 _mm_mask_rsqrt14_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vrsqrt14ps
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i])) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vrsqrt14ps
__m128 _mm_maskz_rsqrt14_ps (__mmask8 k, __m128 a)

Synopsis

__m128 _mm_maskz_rsqrt14_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vrsqrt14ps
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i])) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vrsqrt14ps
__m256 _mm256_mask_rsqrt14_ps (__m256 src, __mmask8 k, __m256 a)

Synopsis

__m256 _mm256_mask_rsqrt14_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vrsqrt14ps
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i])) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vrsqrt14ps
__m256 _mm256_maskz_rsqrt14_ps (__mmask8 k, __m256 a)

Synopsis

__m256 _mm256_maskz_rsqrt14_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vrsqrt14ps
CPUID Flags: AVX512VL + AVX512F

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i])) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vrsqrt14ps
__m512 _mm512_mask_rsqrt14_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_rsqrt14_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrsqrt14ps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i])) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vrsqrt14ps
__m512 _mm512_maskz_rsqrt14_ps (__mmask16 k, __m512 a)

Synopsis

__m512 _mm512_maskz_rsqrt14_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrsqrt14ps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i])) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vrsqrt14ps
__m512 _mm512_rsqrt14_ps (__m512 a)

Synopsis

__m512 _mm512_rsqrt14_ps (__m512 a)
#include "immintrin.h"
Instruction: vrsqrt14ps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i])) ENDFOR dst[MAX:512] := 0
vrsqrt14sd
__m128d _mm_mask_rsqrt14_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_rsqrt14_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrsqrt14sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.

Operation

IF k[0] dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0])) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vrsqrt14sd
__m128d _mm_maskz_rsqrt14_sd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_rsqrt14_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrsqrt14sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.

Operation

IF k[0] dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0])) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vrsqrt14sd
__m128d _mm_rsqrt14_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_rsqrt14_sd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrsqrt14sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.

Operation

dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0])) dst[127:64] := a[127:64] dst[MAX:128] := 0
vrsqrt14ss
__m128 _mm_mask_rsqrt14_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_rsqrt14_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrsqrt14ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.

Operation

IF k[0] dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0])) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vrsqrt14ss
__m128 _mm_maskz_rsqrt14_ss (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_rsqrt14_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrsqrt14ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.

Operation

IF k[0] dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0])) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vrsqrt14ss
__m128 _mm_rsqrt14_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_rsqrt14_ss (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrsqrt14ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.

Operation

dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0])) dst[127:32] := a[127:32] dst[MAX:128] := 0
vrsqrt23ps
__m512 _mm512_mask_rsqrt23_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_rsqrt23_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrsqrt23ps zmm {k}, m512
CPUID Flags: KNCNI

Description

Calculates the reciprocal square root of packed single-precision (32-bit) floating-point elements in a to 23 bits of accuracy and stores the result in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := Sqrt(1.0 / a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vrsqrt23ps
__m512 _mm512_rsqrt23_ps (__m512 a)

Synopsis

__m512 _mm512_rsqrt23_ps (__m512 a)
#include "immintrin.h"
Instruction: vrsqrt23ps zmm {k}, m512
CPUID Flags: KNCNI

Description

Calculates the reciprocal square root of packed single-precision (32-bit) floating-point elements in a to 23 bits of accuracy and stores the result in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := Sqrt(1.0 / a[i+31:i]) ENDFOR dst[MAX:512] := 0
vrsqrt28pd
__m512d _mm512_mask_rsqrt28_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_rsqrt28_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrsqrt28pd zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.

Operation

FOR j := 0 to 7 i := j*64; IF k[j] THEN dst[i+63:i] := (1.0/SQRT(a[i+63:i])); ELSE dst[i+63:i] := src[i+63:i]; FI ENDFOR;
vrsqrt28pd
__m512d _mm512_maskz_rsqrt28_pd (__mmask8 k, __m512d a)

Synopsis

__m512d _mm512_maskz_rsqrt28_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrsqrt28pd zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.

Operation

FOR j := 0 to 7 i := j*64; IF k[j] THEN dst[i+63:i] := (1.0/SQRT(a[i+63:i])); ELSE dst[i+63:i] := 0; FI ENDFOR;
vrsqrt28pd
__m512d _mm512_rsqrt28_pd (__m512d a)

Synopsis

__m512d _mm512_rsqrt28_pd (__m512d a)
#include "immintrin.h"
Instruction: vrsqrt28pd zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, store the results in dst. The maximum relative error for this approximation is less than 2^-28.

Operation

FOR j := 0 to 7 i := j*64; dst[i+63:i] := (1.0/SQRT(a[i+63:i])); ENDFOR;
vrsqrt28ps
__m512 _mm512_mask_rsqrt28_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_rsqrt28_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrsqrt28ps zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.

Operation

FOR j := 0 to 15 i := j*32; IF k[j] THEN dst[i+31:i] := (1.0/SQRT(a[i+31:i])); ELSE dst[i+31:i] := src[i+31:i]; FI ENDFOR;
vrsqrt28ps
__m512 _mm512_maskz_rsqrt28_ps (__mmask16 k, __m512 a)

Synopsis

__m512 _mm512_maskz_rsqrt28_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrsqrt28ps zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.

Operation

FOR j := 0 to 15 i := j*32; IF k[j] THEN dst[i+31:i] := (1.0/SQRT(a[i+31:i])); ELSE dst[i+31:i] := 0; FI ENDFOR;
vrsqrt28ps
__m512 _mm512_rsqrt28_ps (__m512 a)

Synopsis

__m512 _mm512_rsqrt28_ps (__m512 a)
#include "immintrin.h"
Instruction: vrsqrt28ps zmm {k}, zmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, store the results in dst. The maximum relative error for this approximation is less than 2^-28.

Operation

FOR j := 0 to 15 i := j*32; dst[i+31:i] := (1.0/SQRT(a[i+31:i])); ENDFOR;
vrsqrt28pd
__m512d _mm512_mask_rsqrt28_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)

Synopsis

__m512d _mm512_mask_rsqrt28_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64; IF k[j] THEN dst[i+63:i] := (1.0/SQRT(a[i+63:i])); ELSE dst[i+63:i] := src[i+63:i]; FI ENDFOR;
vrsqrt28pd
__m512d _mm512_maskz_rsqrt28_round_pd (__mmask8 k, __m512d a, int rounding)

Synopsis

__m512d _mm512_maskz_rsqrt28_round_pd (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64; IF k[j] THEN dst[i+63:i] := (1.0/SQRT(a[i+63:i])); ELSE dst[i+63:i] := 0; FI ENDFOR;
vrsqrt28pd
__m512d _mm512_rsqrt28_round_pd (__m512d a, int rounding)

Synopsis

__m512d _mm512_rsqrt28_round_pd (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, store the results in dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64; dst[i+63:i] := (1.0/SQRT(a[i+63:i])); ENDFOR;
vrsqrt28ps
__m512 _mm512_mask_rsqrt28_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)

Synopsis

__m512 _mm512_mask_rsqrt28_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32; IF k[j] THEN dst[i+31:i] := (1.0/SQRT(a[i+31:i])); ELSE dst[i+31:i] := src[i+31:i]; FI ENDFOR;
vrsqrt28ps
__m512 _mm512_maskz_rsqrt28_round_ps (__mmask16 k, __m512 a, int rounding)

Synopsis

__m512 _mm512_maskz_rsqrt28_round_ps (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32; IF k[j] THEN dst[i+31:i] := (1.0/SQRT(a[i+31:i])); ELSE dst[i+31:i] := 0; FI ENDFOR;
vrsqrt28ps
__m512 _mm512_rsqrt28_round_ps (__m512 a, int rounding)

Synopsis

__m512 _mm512_rsqrt28_round_ps (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, store the results in dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32; dst[i+31:i] := (1.0/SQRT(a[i+31:i])); ENDFOR;
vrsqrt28sd
__m128d _mm_mask_rsqrt28_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_mask_rsqrt28_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] THEN dst[63:0] := (1.0/SQRT(b[63:0])); ELSE dst[63:0] := src[63:0]; FI dst[127:64] := a[127:64]; dst[MAX:128] := 0;
vrsqrt28sd
__m128d _mm_maskz_rsqrt28_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_maskz_rsqrt28_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] THEN dst[63:0] := (1.0/SQRT(b[63:0])); ELSE dst[63:0] := 0; FI dst[127:64] := a[127:64]; dst[MAX:128] := 0;
vrsqrt28sd
__m128d _mm_rsqrt28_round_sd (__m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_rsqrt28_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := (1.0/SQRT(b[63:0])); dst[127:64] := a[127:64]; dst[MAX:128] := 0;
vrsqrt28ss
__m128 _mm_mask_rsqrt28_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_mask_rsqrt28_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] THEN dst[31:0] := (1.0/SQRT(b[31:0])); ELSE dst[31:0] := src[31:0]; FI dst[127:32] := a[127:32]; dst[MAX:128] := 0;
vrsqrt28ss
__m128 _mm_maskz_rsqrt28_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_maskz_rsqrt28_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] THEN dst[31:0] := (1.0/SQRT(b[31:0])); ELSE dst[31:0] := 0; FI dst[127:32] := a[127:32]; dst[MAX:128] := 0;
vrsqrt28ss
__m128 _mm_rsqrt28_round_ss (__m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_rsqrt28_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := (1.0/SQRT(b[31:0])); dst[127:32] := a[127:32]; dst[MAX:128] := 0;
vrsqrt28sd
__m128d _mm_mask_rsqrt28_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_rsqrt28_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrsqrt28sd xmm {k}, xmm, xmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.

Operation

IF k[0] THEN dst[63:0] := (1.0/SQRT(b[63:0])); ELSE dst[63:0] := src[63:0]; FI dst[127:64] := a[127:64]; dst[MAX:128] := 0;
vrsqrt28sd
__m128d _mm_maskz_rsqrt28_sd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_rsqrt28_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrsqrt28sd xmm {k}, xmm, xmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.

Operation

IF k[0] THEN dst[63:0] := (1.0/SQRT(b[63:0])); ELSE dst[63:0] := 0; FI dst[127:64] := a[127:64]; dst[MAX:128] := 0;
vrsqrt28sd
__m128d _mm_rsqrt28_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_rsqrt28_sd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrsqrt28sd xmm {k}, xmm, xmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.

Operation

dst[63:0] := (1.0/SQRT(b[63:0])); dst[127:64] := a[127:64]; dst[MAX:128] := 0;
vrsqrt28ss
__m128 _mm_mask_rsqrt28_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_rsqrt28_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrsqrt28ss xmm {k}, xmm, xmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.

Operation

IF k[0] THEN dst[31:0] := (1.0/SQRT(b[31:0])); ELSE dst[31:0] := src[31:0]; FI dst[127:32] := a[127:32]; dst[MAX:128] := 0;
vrsqrt28ss
__m128 _mm_maskz_rsqrt28_ss (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_rsqrt28_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrsqrt28ss xmm {k}, xmm, xmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.

Operation

IF k[0] THEN dst[31:0] := (1.0/SQRT(b[31:0])); ELSE dst[31:0] := 0; FI dst[127:32] := a[127:32]; dst[MAX:128] := 0;
vrsqrt28ss
__m128 _mm_rsqrt28_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_rsqrt28_ss (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrsqrt28ss xmm {k}, xmm, xmm
CPUID Flags: AVX512ER

Description

Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.

Operation

dst[31:0] := (1.0/SQRT(b[31:0])); dst[127:32] := a[127:32]; dst[MAX:128] := 0;
psadbw
__m128i _mm_sad_epu8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_sad_epu8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psadbw xmm, xmm
CPUID Flags: SSE2

Description

Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst.

Operation

FOR j := 0 to 15 i := j*8 tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) ENDFOR FOR j := 0 to 1 i := j*64 dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] dst[i+63:i+16] := 0 ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell51
Ivy Bridge51
Sandy Bridge51
Westmere51
Nehalem51
vpsadbw
__m256i _mm256_sad_epu8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_sad_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsadbw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst.

Operation

FOR j := 0 to 31 i := j*8 tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) ENDFOR FOR j := 0 to 3 i := j*64 dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] dst[i+63:i+16] := 0 ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell51
vpsadbw
__m512i _mm512_sad_epu8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_sad_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsadbw
CPUID Flags: AVX512BW

Description

Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum each consecutive 8 differences to produce eight unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst.

Operation

FOR j := 0 to 63 i := j*8 tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) ENDFOR FOR j := 0 to 7 i := j*64 dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] dst[i+63:i+16] := 0 ENDFOR dst[MAX:512] := 0
psadbw
__m64 _mm_sad_pu8 (__m64 a, __m64 b)

Synopsis

__m64 _mm_sad_pu8 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: psadbw mm, mm
CPUID Flags: SSE

Description

Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum each consecutive 8 differences to produce one unsigned 16-bit integer, and pack this unsigned 16-bit integer in the low 16 bits of dst.

Operation

FOR j := 0 to 7 i := j*8 tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) ENDFOR dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0
vpsbbd
__m512i _mm512_mask_sbb_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * borrow)

Synopsis

__m512i _mm512_mask_sbb_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsbbd zmm {k}, k, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element three-input subtraction of packed 32-bit integer elements of v3 as well as the corresponding bit from k2 from v2. The borrowed value from the subtraction difference for the nth element is written to the nth bit of borrow (borrow flag). Results are stored in dst using writemask k1 (elements are copied from v2 when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - k2[j] borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k2[j]) ELSE dst[i+31:i] := v2[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpsbbd
__m512i _mm512_sbb_epi32 (__m512i v2, __mmask16 k, __m512i v3, __mmask16 * borrow)

Synopsis

__m512i _mm512_sbb_epi32 (__m512i v2, __mmask16 k, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsbbd zmm {k}, k, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element three-input subtraction of packed 32-bit integer elements of v3 as well as the corresponding bit from k from v2. The borrowed value from the subtraction difference for the nth element is written to the nth bit of borrow (borrow flag). Results are stored in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - k[j] borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k[j]) ENDFOR dst[MAX:512] := 0
vpsbbrd
__m512i _mm512_mask_sbbr_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * borrow)

Synopsis

__m512i _mm512_mask_sbbr_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsbbrd zmm {k}, k, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element three-input subtraction of packed 32-bit integer elements of v2 as well as the corresponding bit from k2 from v3. The borrowed value from the subtraction difference for the nth element is written to the nth bit of borrow (borrow flag). Results are stored in dst using writemask k1 (elements are copied from v2 when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] dst[i+31:i] := v3[i+31:i] - v2[i+31:i] - k2[j] borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i] - k2[j]) ELSE dst[i+31:i] := v2[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpsbbrd
__m512i _mm512_sbbr_epi32 (__m512i v2, __mmask16 k, __m512i v3, __mmask16 * borrow)

Synopsis

__m512i _mm512_sbbr_epi32 (__m512i v2, __mmask16 k, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsbbrd zmm {k}, k, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element three-input subtraction of packed 32-bit integer elements of v2 as well as the corresponding bit from k from v3. The borrowed value from the subtraction difference for the nth element is written to the nth bit of borrow (borrow flag). Results are stored in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := v3[i+31:i] - v2[i+31:i] - k[j] borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i] - k[j]) ENDFOR dst[MAX:512] := 0
vscaleps
__m512 _mm512_mask_scale_ps (__m512 src, __mmask16 k, __m512 a, __m512i b)

Synopsis

__m512 _mm512_mask_scale_ps (__m512 src, __mmask16 k, __m512 a, __m512i b)
#include "immintrin.h"
Instruction: vscaleps zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Scales each single-precision (32-bit) floating-point element in a by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in b, storing results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vscaleps
__m512 _mm512_scale_ps (__m512 a, __m512i b)

Synopsis

__m512 _mm512_scale_ps (__m512 a, __m512i b)
#include "immintrin.h"
Instruction: vscaleps zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Scales each single-precision (32-bit) floating-point element in a by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in b, storing results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i]) ENDFOR dst[MAX:512] := 0
vscaleps
__m512 _mm512_mask_scale_round_ps (__m512 src, __mmask16 k, __m512 a, __m512i b, int rounding)

Synopsis

__m512 _mm512_mask_scale_round_ps (__m512 src, __mmask16 k, __m512 a, __m512i b, int rounding)
#include "immintrin.h"
Instruction: vscaleps zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Scales each single-precision (32-bit) floating-point element in a by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in b, storing results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Results are rounded using constant rounding.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vscaleps
__m512 _mm512_scale_round_ps (__m512 a, __m512i b, int rounding)

Synopsis

__m512 _mm512_scale_round_ps (__m512 a, __m512i b, int rounding)
#include "immintrin.h"
Instruction: vscaleps zmm {k}, zmm, m512
CPUID Flags: KNCNI

Description

Scales each single-precision (32-bit) floating-point element in a by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in b, storing results in dst. Intermediate elements are rounded using rounding.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i]) ENDFOR dst[MAX:512] := 0
vscalefpd
__m128d _mm_mask_scalef_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_scalef_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vscalefpd
CPUID Flags: AVX512VL + AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vscalefpd
__m128d _mm_maskz_scalef_pd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_scalef_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vscalefpd
CPUID Flags: AVX512VL + AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vscalefpd
__m128d _mm_scalef_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_scalef_pd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vscalefpd
CPUID Flags: AVX512VL + AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } FOR j := 0 to 1 i := j*64 dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:128] := 0
vscalefpd
__m256d _mm256_mask_scalef_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_mask_scalef_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vscalefpd
CPUID Flags: AVX512VL + AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vscalefpd
__m256d _mm256_maskz_scalef_pd (__mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_maskz_scalef_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vscalefpd
CPUID Flags: AVX512VL + AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vscalefpd
__m256d _mm256_scalef_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_scalef_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vscalefpd
CPUID Flags: AVX512VL + AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } FOR j := 0 to 3 i := j*64 dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:256] := 0
vscalefpd
__m512d _mm512_mask_scalef_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_scalef_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vscalefpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vscalefpd
__m512d _mm512_maskz_scalef_pd (__mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_maskz_scalef_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vscalefpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vscalefpd
__m512d _mm512_scalef_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_scalef_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vscalefpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } FOR j := 0 to 7 i := j*64 dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
vscalefps
__m128 _mm_mask_scalef_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_scalef_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vscalefps
CPUID Flags: AVX512VL + AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vscalefps
__m128 _mm_maskz_scalef_ps (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_scalef_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vscalefps
CPUID Flags: AVX512VL + AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vscalefps
__m128 _mm_scalef_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_scalef_ps (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vscalefps
CPUID Flags: AVX512VL + AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } FOR j := 0 to 3 i := j*32 dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:128] := 0
vscalefps
__m256 _mm256_mask_scalef_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_mask_scalef_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vscalefps
CPUID Flags: AVX512VL + AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vscalefps
__m256 _mm256_maskz_scalef_ps (__mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_maskz_scalef_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vscalefps
CPUID Flags: AVX512VL + AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vscalefps
__m256 _mm256_scalef_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_scalef_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vscalefps
CPUID Flags: AVX512VL + AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } FOR j := 0 to 7 i := j*32 dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:256] := 0
vscalefps
__m512 _mm512_mask_scalef_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_scalef_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vscalefps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vscalefps
__m512 _mm512_maskz_scalef_ps (__mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_maskz_scalef_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vscalefps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vscalefps
__m512 _mm512_scalef_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_scalef_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vscalefps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } FOR j := 0 to 15 i := j*32 dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
vscalefpd
__m512d _mm512_mask_scalef_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_mask_scalef_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vscalefpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vscalefpd
__m512d _mm512_maskz_scalef_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_maskz_scalef_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vscalefpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vscalefpd
__m512d _mm512_scalef_round_pd (__m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_scalef_round_pd (__m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vscalefpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } FOR j := 0 to 7 i := j*64 dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
vscalefps
__m512 _mm512_mask_scalef_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_mask_scalef_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vscalefps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vscalefps
__m512 _mm512_maskz_scalef_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_maskz_scalef_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vscalefps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vscalefps
__m512 _mm512_scalef_round_ps (__m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_scalef_round_ps (__m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vscalefps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } FOR j := 0 to 15 i := j*32 dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
vscalefsd
__m128d _mm_mask_scalef_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_mask_scalef_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vscalefsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } IF k[0] dst[63:0] := SCALE(a[63:0], b[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vscalefsd
__m128d _mm_maskz_scalef_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_maskz_scalef_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vscalefsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } IF k[0] dst[63:0] := SCALE(a[63:0], b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vscalefsd
__m128d _mm_scalef_round_sd (__m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_scalef_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vscalefsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from b to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } dst[63:0] := SCALE(a[63:0], b[63:0]) dst[127:64] := b[127:64] dst[MAX:128] := 0
vscalefss
__m128 _mm_mask_scalef_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_mask_scalef_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vscalefss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } IF k[0] dst[31:0] := SCALE(a[31:0], b[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vscalefss
__m128 _mm_maskz_scalef_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_maskz_scalef_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vscalefss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } IF k[0] dst[31:0] := SCALE(a[31:0], b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vscalefss
__m128 _mm_scalef_round_ss (__m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_scalef_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vscalefss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from b to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } dst[31:0] := SCALE(a[31:0], b[31:0]) dst[127:32] := b[127:32] dst[MAX:128] := 0
vscalefsd
__m128d _mm_mask_scalef_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_scalef_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vscalefsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } IF k[0] dst[63:0] := SCALE(a[63:0], b[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vscalefsd
__m128d _mm_maskz_scalef_sd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_scalef_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vscalefsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } IF k[0] dst[63:0] := SCALE(a[63:0], b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vscalefsd
__m128d _mm_scalef_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_scalef_sd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vscalefsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from b to the upper element of dst.

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0])) RETURN dst[63:0] } dst[63:0] := SCALE(a[63:0], b[63:0]) dst[127:64] := b[127:64] dst[MAX:128] := 0
vscalefss
__m128 _mm_mask_scalef_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_scalef_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vscalefss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } IF k[0] dst[31:0] := SCALE(a[31:0], b[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vscalefss
__m128 _mm_maskz_scalef_ss (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_scalef_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vscalefss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } IF k[0] dst[31:0] := SCALE(a[31:0], b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vscalefss
__m128 _mm_scalef_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_scalef_ss (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vscalefss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from b to the upper elements of dst.

Operation

SCALE(src1, src2){ IF (src2 == NaN) IF (src2 == SNaN) RETURN QNAN(src2) FI ELSE IF (src1 == NaN) IF (src1 == SNaN) RETURN QNAN(src1) FI IF (src2 != INF) RETURN QNAN(src1) FI ELSE tmp_src2 := src2 tmp_src1 := src1 IF (src2 is denormal AND MXCSR.DAZ) tmp_src2 := 0 FI IF (src1 is denormal AND MXCSR.DAZ) tmp_src1 := 0 FI FI dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0])) RETURN dst[31:0] } dst[31:0] := SCALE(a[31:0], b[31:0]) dst[127:32] := b[127:32] dst[MAX:128] := 0
...
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)

Synopsis

__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Set packed 16-bit integers in dst with the supplied values.

Operation

dst[15:0] := e0 dst[31:16] := e1 dst[47:32] := e2 dst[63:48] := e3 dst[79:64] := e4 dst[95:80] := e5 dst[111:96] := e6 dst[127:112] := e7
...
__m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)

Synopsis

__m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
#include "immintrin.h"
CPUID Flags: AVX

Description

Set packed 16-bit integers in dst with the supplied values.

Operation

dst[15:0] := e0 dst[31:16] := e1 dst[47:32] := e2 dst[63:48] := e3 dst[79:64] := e4 dst[95:80] := e5 dst[111:96] := e6 dst[127:112] := e7 dst[143:128] := e8 dst[159:144] := e9 dst[175:160] := e10 dst[191:176] := e11 dst[207:192] := e12 dst[223:208] := e13 dst[239:224] := e14 dst[255:240] := e15 dst[MAX:256] := 0
...
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0)

Synopsis

__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Set packed 32-bit integers in dst with the supplied values.

Operation

dst[31:0] := e0 dst[63:32] := e1 dst[95:64] := e2 dst[127:96] := e3
...
__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)

Synopsis

__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
#include "immintrin.h"
CPUID Flags: AVX

Description

Set packed 32-bit integers in dst with the supplied values.

Operation

dst[31:0] := e0 dst[63:32] := e1 dst[95:64] := e2 dst[127:96] := e3 dst[159:128] := e4 dst[191:160] := e5 dst[223:192] := e6 dst[255:224] := e7 dst[MAX:256] := 0
...
__m512i _mm512_set_epi32 (int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)

Synopsis

__m512i _mm512_set_epi32 (int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed 32-bit integers in dst with the supplied values.

Operation

dst[31:0] := e0 dst[63:32] := e1 dst[95:64] := e2 dst[127:96] := e3 dst[159:128] := e4 dst[191:160] := e5 dst[223:192] := e6 dst[255:224] := e7 dst[287:256] := e8 dst[319:288] := e9 dst[351:320] := e10 dst[383:352] := e11 dst[415:384] := e12 dst[447:416] := e13 dst[479:448] := e14 dst[511:480] := e15 dst[MAX:512] := 0
...
__m128i _mm_set_epi64 (__m64 e1, __m64 e0)

Synopsis

__m128i _mm_set_epi64 (__m64 e1, __m64 e0)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Set packed 64-bit integers in dst with the supplied values.

Operation

dst[63:0] := e0 dst[127:64] := e1
...
__m512i _mm512_set_epi64 (__int64 e7, __int64 e6, __int64 e5, __int64 e4, __int64 e3, __int64 e2, __int64 e1, __int64 e0)

Synopsis

__m512i _mm512_set_epi64 (__int64 e7, __int64 e6, __int64 e5, __int64 e4, __int64 e3, __int64 e2, __int64 e1, __int64 e0)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed 64-bit integers in dst with the supplied values.

Operation

dst[63:0] := e0 dst[127:64] := e1 dst[191:128] := e2 dst[255:192] := e3 dst[319:256] := e4 dst[383:320] := e5 dst[447:384] := e6 dst[511:448] := e7 dst[MAX:512] := 0
...
__m128i _mm_set_epi64x (__int64 e1, __int64 e0)

Synopsis

__m128i _mm_set_epi64x (__int64 e1, __int64 e0)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Set packed 64-bit integers in dst with the supplied values.

Operation

dst[63:0] := e0 dst[127:64] := e1
...
__m256i _mm256_set_epi64x (__int64 e3, __int64 e2, __int64 e1, __int64 e0)

Synopsis

__m256i _mm256_set_epi64x (__int64 e3, __int64 e2, __int64 e1, __int64 e0)
#include "immintrin.h"
CPUID Flags: AVX

Description

Set packed 64-bit integers in dst with the supplied values.

Operation

dst[63:0] := e0 dst[127:64] := e1 dst[191:128] := e2 dst[255:192] := e3 dst[MAX:256] := 0
...
__m128i _mm_set_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)

Synopsis

__m128i _mm_set_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Set packed 8-bit integers in dst with the supplied values.

Operation

dst[7:0] := e0 dst[15:8] := e1 dst[23:16] := e2 dst[31:24] := e3 dst[39:32] := e4 dst[47:40] := e5 dst[55:48] := e6 dst[63:56] := e7 dst[71:64] := e8 dst[79:72] := e9 dst[87:80] := e10 dst[95:88] := e11 dst[103:96] := e12 dst[111:104] := e13 dst[119:112] := e14 dst[127:120] := e15
...
__m256i _mm256_set_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)

Synopsis

__m256i _mm256_set_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
#include "immintrin.h"
CPUID Flags: AVX

Description

Set packed 8-bit integers in dst with the supplied values.

Operation

dst[7:0] := e0 dst[15:8] := e1 dst[23:16] := e2 dst[31:24] := e3 dst[39:32] := e4 dst[47:40] := e5 dst[55:48] := e6 dst[63:56] := e7 dst[71:64] := e8 dst[79:72] := e9 dst[87:80] := e10 dst[95:88] := e11 dst[103:96] := e12 dst[111:104] := e13 dst[119:112] := e14 dst[127:120] := e15 dst[135:128] := e16 dst[143:136] := e17 dst[151:144] := e18 dst[159:152] := e19 dst[167:160] := e20 dst[175:168] := e21 dst[183:176] := e22 dst[191:184] := e23 dst[199:192] := e24 dst[207:200] := e25 dst[215:208] := e26 dst[223:216] := e27 dst[231:224] := e28 dst[239:232] := e29 dst[247:240] := e30 dst[255:248] := e31 dst[MAX:256] := 0
void _MM_SET_EXCEPTION_MASK (unsigned int a)

Synopsis

void _MM_SET_EXCEPTION_MASK (unsigned int a)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Macro: Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer a. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT

Operation

MXCSR := a[31:0] AND ~_MM_MASK_MASK
void _MM_SET_EXCEPTION_STATE (unsigned int a)

Synopsis

void _MM_SET_EXCEPTION_STATE (unsigned int a)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Macro: Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer a. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT

Operation

MXCSR := a[31:0] AND ~_MM_EXCEPT_MASK
void _MM_SET_FLUSH_ZERO_MODE (unsigned int a)

Synopsis

void _MM_SET_FLUSH_ZERO_MODE (unsigned int a)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Macro: Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer a. The flush zero mode may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF

Operation

MXCSR := a[31:0] AND ~_MM_FLUSH_MASK
vinsertf128
__m256 _mm256_set_m128 (__m128 hi, __m128 lo)

Synopsis

__m256 _mm256_set_m128 (__m128 hi, __m128 lo)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX

Description

Set packed __m256 vector dst with the supplied values.

Operation

dst[127:0] := lo[127:0] dst[255:128] := hi[127:0] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge1-
Sandy Bridge1-
vinsertf128
__m256d _mm256_set_m128d (__m128d hi, __m128d lo)

Synopsis

__m256d _mm256_set_m128d (__m128d hi, __m128d lo)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX

Description

Set packed __m256d vector dst with the supplied values.

Operation

dst[127:0] := lo[127:0] dst[255:128] := hi[127:0] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge1-
Sandy Bridge1-
vinsertf128
__m256i _mm256_set_m128i (__m128i hi, __m128i lo)

Synopsis

__m256i _mm256_set_m128i (__m128i hi, __m128i lo)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX

Description

Set packed __m256i vector dst with the supplied values.

Operation

dst[127:0] := lo[127:0] dst[255:128] := hi[127:0] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge1-
Sandy Bridge1-
...
__m128d _mm_set_pd (double e1, double e0)

Synopsis

__m128d _mm_set_pd (double e1, double e0)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Set packed double-precision (64-bit) floating-point elements in dst with the supplied values.

Operation

dst[63:0] := e0 dst[127:64] := e1
...
__m256d _mm256_set_pd (double e3, double e2, double e1, double e0)

Synopsis

__m256d _mm256_set_pd (double e3, double e2, double e1, double e0)
#include "immintrin.h"
CPUID Flags: AVX

Description

Set packed double-precision (64-bit) floating-point elements in dst with the supplied values.

Operation

dst[63:0] := e0 dst[127:64] := e1 dst[191:128] := e2 dst[255:192] := e3 dst[MAX:256] := 0
...
__m512d _mm512_set_pd (double e7, double e6, double e5, double e4, double e3, double e2, double e1, double e0)

Synopsis

__m512d _mm512_set_pd (double e7, double e6, double e5, double e4, double e3, double e2, double e1, double e0)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed double-precision (64-bit) floating-point elements in dst with the supplied values.

Operation

dst[63:0] := e0 dst[127:64] := e1 dst[191:128] := e2 dst[255:192] := e3 dst[319:256] := e4 dst[383:320] := e5 dst[447:384] := e6 dst[511:448] := e7 dst[MAX:512] := 0
...
__m128d _mm_set_pd1 (double a)

Synopsis

__m128d _mm_set_pd1 (double a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Broadcast double-precision (64-bit) floating-point value a to all elements of dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := a[63:0] ENDFOR
...
__m128 _mm_set_ps (float e3, float e2, float e1, float e0)

Synopsis

__m128 _mm_set_ps (float e3, float e2, float e1, float e0)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Set packed single-precision (32-bit) floating-point elements in dst with the supplied values.

Operation

dst[31:0] := e0 dst[63:32] := e1 dst[95:64] := e2 dst[127:96] := e3
...
__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)

Synopsis

__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
#include "immintrin.h"
CPUID Flags: AVX

Description

Set packed single-precision (32-bit) floating-point elements in dst with the supplied values.

Operation

dst[31:0] := e0 dst[63:32] := e1 dst[95:64] := e2 dst[127:96] := e3 dst[159:128] := e4 dst[191:160] := e5 dst[223:192] := e6 dst[255:224] := e7 dst[MAX:256] := 0
...
__m512 _mm512_set_ps (float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)

Synopsis

__m512 _mm512_set_ps (float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed single-precision (32-bit) floating-point elements in dst with the supplied values.

Operation

dst[31:0] := e0 dst[63:32] := e1 dst[95:64] := e2 dst[127:96] := e3 dst[159:128] := e4 dst[191:160] := e5 dst[223:192] := e6 dst[255:224] := e7 dst[287:256] := e8 dst[319:288] := e9 dst[351:320] := e10 dst[383:352] := e11 dst[415:384] := e12 dst[447:416] := e13 dst[479:448] := e14 dst[511:480] := e15 dst[MAX:512] := 0
...
__m128 _mm_set_ps1 (float a)

Synopsis

__m128 _mm_set_ps1 (float a)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Broadcast single-precision (32-bit) floating-point value a to all elements of dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := a[31:0] ENDFOR
void _MM_SET_ROUNDING_MODE (unsigned int a)

Synopsis

void _MM_SET_ROUNDING_MODE (unsigned int a)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Macro: Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer a. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO

Operation

MXCSR := a[31:0] AND ~_MM_ROUND_MASK
...
__m128d _mm_set_sd (double a)

Synopsis

__m128d _mm_set_sd (double a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Copy double-precision (64-bit) floating-point element a to the lower element of dst, and zero the upper element.

Operation

dst[63:0] := a[63:0] dst[127:64] := 0
...
__m128 _mm_set_ss (float a)

Synopsis

__m128 _mm_set_ss (float a)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Copy single-precision (32-bit) floating-point element a to the lower element of dst, and zero the upper 3 elements.

Operation

dst[31:0] := a[31:0] dst[127:32] := 0
vpbroadcastw
__m128i _mm_mask_set1_epi16 (__m128i src, __mmask8 k, short a)

Synopsis

__m128i _mm_mask_set1_epi16 (__m128i src, __mmask8 k, short a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpbroadcastw
__m128i _mm_maskz_set1_epi16 (__mmask8 k, short a)

Synopsis

__m128i _mm_maskz_set1_epi16 (__mmask8 k, short a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
...
__m128i _mm_set1_epi16 (short a)

Synopsis

__m128i _mm_set1_epi16 (short a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Broadcast 16-bit integer a to all elements of dst. This intrinsic may generate vpbroadcastw.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := a[15:0] ENDFOR
vpbroadcastw
__m256i _mm256_mask_set1_epi16 (__m256i src, __mmask16 k, short a)

Synopsis

__m256i _mm256_mask_set1_epi16 (__m256i src, __mmask16 k, short a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpbroadcastw
__m256i _mm256_maskz_set1_epi16 (__mmask16 k, short a)

Synopsis

__m256i _mm256_maskz_set1_epi16 (__mmask16 k, short a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast 16-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
...
__m256i _mm256_set1_epi16 (short a)

Synopsis

__m256i _mm256_set1_epi16 (short a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Broadcast 16-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastw instruction.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := a[15:0] ENDFOR dst[MAX:256] := 0
vpbroadcastw
__m512i _mm512_mask_set1_epi16 (__m512i src, __mmask32 k, short a)

Synopsis

__m512i _mm512_mask_set1_epi16 (__m512i src, __mmask32 k, short a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512BW

Description

Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpbroadcastw
__m512i _mm512_maskz_set1_epi16 (__mmask32 k, short a)

Synopsis

__m512i _mm512_maskz_set1_epi16 (__mmask32 k, short a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512BW

Description

Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
...
__m512i _mm512_set1_epi16 (short a)

Synopsis

__m512i _mm512_set1_epi16 (short a)
#include "immintrin.h"
Instruction: vpbroadcastw zmm, xmm
CPUID Flags: AVX512BW

Description

Broadcast the low packed 16-bit integer from a to all elements of dst.

Operation

FOR j := 0 to 31 i := j*16 dst[i+15:i] := a[15:0] ENDFOR dst[MAX:512] := 0
vpbroadcastd
__m128i _mm_mask_set1_epi32 (__m128i src, __mmask8 k, int a)

Synopsis

__m128i _mm_mask_set1_epi32 (__m128i src, __mmask8 k, int a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpbroadcastd
__m128i _mm_maskz_set1_epi32 (__mmask8 k, int a)

Synopsis

__m128i _mm_maskz_set1_epi32 (__mmask8 k, int a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
...
__m128i _mm_set1_epi32 (int a)

Synopsis

__m128i _mm_set1_epi32 (int a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Broadcast 32-bit integer a to all elements of dst. This intrinsic may generate vpbroadcastd.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := a[31:0] ENDFOR
vpbroadcastd
__m256i _mm256_mask_set1_epi32 (__m256i src, __mmask8 k, int a)

Synopsis

__m256i _mm256_mask_set1_epi32 (__m256i src, __mmask8 k, int a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpbroadcastd
__m256i _mm256_maskz_set1_epi32 (__mmask8 k, int a)

Synopsis

__m256i _mm256_maskz_set1_epi32 (__mmask8 k, int a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
...
__m256i _mm256_set1_epi32 (int a)

Synopsis

__m256i _mm256_set1_epi32 (int a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Broadcast 32-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastd instruction.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := a[31:0] ENDFOR dst[MAX:256] := 0
vpbroadcastd
__m512i _mm512_mask_set1_epi32 (__m512i src, __mmask16 k, int a)

Synopsis

__m512i _mm512_mask_set1_epi32 (__m512i src, __mmask16 k, int a)
#include "immintrin.h"
Instruction: vpbroadcastd zmm {k}, r32
CPUID Flags: AVX512F

Description

Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpbroadcastd
__m512i _mm512_maskz_set1_epi32 (__mmask16 k, int a)

Synopsis

__m512i _mm512_maskz_set1_epi32 (__mmask16 k, int a)
#include "immintrin.h"
Instruction: vpbroadcastd zmm {k}, r32
CPUID Flags: AVX512F

Description

Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpbroadcastd
__m512i _mm512_set1_epi32 (int a)

Synopsis

__m512i _mm512_set1_epi32 (int a)
#include "immintrin.h"
Instruction: vpbroadcastd zmm {k}, r32
CPUID Flags: AVX512F

Description

Broadcast 32-bit integer a to all elements of dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[31:0] ENDFOR dst[MAX:512] := 0
vpbroadcastq
__m128i _mm_mask_set1_epi64 (__m128i src, __mmask8 k, __int64 a)

Synopsis

__m128i _mm_mask_set1_epi64 (__m128i src, __mmask8 k, __int64 a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpbroadcastq
__m128i _mm_maskz_set1_epi64 (__mmask8 k, __int64 a)

Synopsis

__m128i _mm_maskz_set1_epi64 (__mmask8 k, __int64 a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
...
__m128i _mm_set1_epi64 (__m64 a)

Synopsis

__m128i _mm_set1_epi64 (__m64 a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Broadcast 64-bit integer a to all elements of dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := a[63:0] ENDFOR
vpbroadcastq
__m256i _mm256_mask_set1_epi64 (__m256i src, __mmask8 k, __int64 a)

Synopsis

__m256i _mm256_mask_set1_epi64 (__m256i src, __mmask8 k, __int64 a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpbroadcastq
__m256i _mm256_maskz_set1_epi64 (__mmask8 k, __int64 a)

Synopsis

__m256i _mm256_maskz_set1_epi64 (__mmask8 k, __int64 a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F

Description

Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpbroadcastq
__m512i _mm512_mask_set1_epi64 (__m512i src, __mmask8 k, __int64 a)

Synopsis

__m512i _mm512_mask_set1_epi64 (__m512i src, __mmask8 k, __int64 a)
#include "immintrin.h"
Instruction: vpbroadcastq zmm {k}, r64
CPUID Flags: AVX512F

Description

Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpbroadcastq
__m512i _mm512_maskz_set1_epi64 (__mmask8 k, __int64 a)

Synopsis

__m512i _mm512_maskz_set1_epi64 (__mmask8 k, __int64 a)
#include "immintrin.h"
Instruction: vpbroadcastq zmm {k}, r64
CPUID Flags: AVX512F

Description

Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpbroadcastq
__m512i _mm512_set1_epi64 (__int64 a)

Synopsis

__m512i _mm512_set1_epi64 (__int64 a)
#include "immintrin.h"
Instruction: vpbroadcastq zmm {k}, r64
CPUID Flags: AVX512F

Description

Broadcast 64-bit integer a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[63:0] ENDFOR dst[MAX:512] := 0
...
__m128i _mm_set1_epi64x (__int64 a)

Synopsis

__m128i _mm_set1_epi64x (__int64 a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Broadcast 64-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastq instruction.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := a[63:0] ENDFOR
...
__m256i _mm256_set1_epi64x (long long a)

Synopsis

__m256i _mm256_set1_epi64x (long long a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Broadcast 64-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastq instruction.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[63:0] ENDFOR dst[MAX:256] := 0
vpbroadcastb
__m128i _mm_mask_set1_epi8 (__m128i src, __mmask16 k, char a)

Synopsis

__m128i _mm_mask_set1_epi8 (__m128i src, __mmask16 k, char a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpbroadcastb
__m128i _mm_maskz_set1_epi8 (__mmask16 k, char a)

Synopsis

__m128i _mm_maskz_set1_epi8 (__mmask16 k, char a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
...
__m128i _mm_set1_epi8 (char a)

Synopsis

__m128i _mm_set1_epi8 (char a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Broadcast 8-bit integer a to all elements of dst. This intrinsic may generate vpbroadcastb.

Operation

FOR j := 0 to 15 i := j*8 dst[i+7:i] := a[7:0] ENDFOR
vpbroadcastb
__m256i _mm256_mask_set1_epi8 (__m256i src, __mmask32 k, char a)

Synopsis

__m256i _mm256_mask_set1_epi8 (__m256i src, __mmask32 k, char a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpbroadcastb
__m256i _mm256_maskz_set1_epi8 (__mmask32 k, char a)

Synopsis

__m256i _mm256_maskz_set1_epi8 (__mmask32 k, char a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW

Description

Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
...
__m256i _mm256_set1_epi8 (char a)

Synopsis

__m256i _mm256_set1_epi8 (char a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Broadcast 8-bit integer a to all elements of dst. This intrinsic may generate vpbroadcastb.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := a[7:0] ENDFOR dst[MAX:256] := 0
vpbroadcastb
__m512i _mm512_mask_set1_epi8 (__m512i src, __mmask64 k, char a)

Synopsis

__m512i _mm512_mask_set1_epi8 (__m512i src, __mmask64 k, char a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512BW

Description

Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpbroadcastb
__m512i _mm512_maskz_set1_epi8 (__mmask64 k, char a)

Synopsis

__m512i _mm512_maskz_set1_epi8 (__mmask64 k, char a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512BW

Description

Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
...
__m512i _mm512_set1_epi8 (char a)

Synopsis

__m512i _mm512_set1_epi8 (char a)
#include "immintrin.h"
Instruction: vpbroadcastb ymm, xmm
CPUID Flags: AVX512F

Description

Broadcast 8-bit integer a to all elements of dst.

Operation

FOR j := 0 to 63 i := j*8 dst[i+7:i] := a[7:0] ENDFOR dst[MAX:512] := 0
...
__m128d _mm_set1_pd (double a)

Synopsis

__m128d _mm_set1_pd (double a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Broadcast double-precision (64-bit) floating-point value a to all elements of dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := a[63:0] ENDFOR
...
__m256d _mm256_set1_pd (double a)

Synopsis

__m256d _mm256_set1_pd (double a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Broadcast double-precision (64-bit) floating-point value a to all elements of dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[63:0] ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_set1_pd (double a)

Synopsis

__m512d _mm512_set1_pd (double a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Broadcast double-precision (64-bit) floating-point value a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[63:0] ENDFOR dst[MAX:512] := 0
...
__m128 _mm_set1_ps (float a)

Synopsis

__m128 _mm_set1_ps (float a)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Broadcast single-precision (32-bit) floating-point value a to all elements of dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := a[31:0] ENDFOR
...
__m256 _mm256_set1_ps (float a)

Synopsis

__m256 _mm256_set1_ps (float a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Broadcast single-precision (32-bit) floating-point value a to all elements of dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := a[31:0] ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_set1_ps (float a)

Synopsis

__m512 _mm512_set1_ps (float a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Broadcast single-precision (32-bit) floating-point value a to all elements of dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[31:0] ENDFOR dst[MAX:512] := 0
...
__m512i _mm512_set4_epi32 (int d, int c, int b, int a)

Synopsis

__m512i _mm512_set4_epi32 (int d, int c, int b, int a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed 32-bit integers in dst with the repeated 4 element sequence.

Operation

dst[31:0] := d dst[63:32] := c dst[95:64] := b dst[127:96] := a dst[159:128] := d dst[191:160] := c dst[223:192] := b dst[255:224] := a dst[287:256] := d dst[319:288] := c dst[351:320] := b dst[383:352] := a dst[415:384] := d dst[447:416] := c dst[479:448] := b dst[511:480] := a dst[MAX:512] := 0
...
__m512i _mm512_set4_epi64 (__int64 d, __int64 c, __int64 b, __int64 a)

Synopsis

__m512i _mm512_set4_epi64 (__int64 d, __int64 c, __int64 b, __int64 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed 64-bit integers in dst with the repeated 4 element sequence.

Operation

dst[63:0] := d dst[127:64] := c dst[191:128] := b dst[255:192] := a dst[319:256] := d dst[383:320] := c dst[447:384] := b dst[511:448] := a dst[MAX:512] := 0
...
__m512d _mm512_set4_pd (double d, double c, double b, double a)

Synopsis

__m512d _mm512_set4_pd (double d, double c, double b, double a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed double-precision (64-bit) floating-point elements in dst with the repeated 4 element sequence.

Operation

dst[63:0] := d dst[127:64] := c dst[191:128] := b dst[255:192] := a dst[319:256] := d dst[383:320] := c dst[447:384] := b dst[511:448] := a dst[MAX:512] := 0
...
__m512 _mm512_set4_ps (float d, float c, float b, float a)

Synopsis

__m512 _mm512_set4_ps (float d, float c, float b, float a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed single-precision (32-bit) floating-point elements in dst with the repeated 4 element sequence.

Operation

dst[31:0] := d dst[63:32] := c dst[95:64] := b dst[127:96] := a dst[159:128] := d dst[191:160] := c dst[223:192] := b dst[255:224] := a dst[287:256] := d dst[319:288] := c dst[351:320] := b dst[383:352] := a dst[415:384] := d dst[447:416] := c dst[479:448] := b dst[511:480] := a dst[MAX:512] := 0
ldmxcsr
void _mm_setcsr (unsigned int a)

Synopsis

void _mm_setcsr (unsigned int a)
#include "xmmintrin.h"
Instruction: ldmxcsr MEMd
CPUID Flags: SSE

Description

Set the MXCSR control and status register with the value in unsigned 32-bit integer a.

Operation

MXCSR := a[31:0]

Performance

ArchitectureLatencyThroughput
Ivy Bridge2-
Sandy Bridge2-
...
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)

Synopsis

__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Set packed 16-bit integers in dst with the supplied values in reverse order.

Operation

dst[15:0] := e7 dst[31:16] := e6 dst[47:32] := e5 dst[63:48] := e4 dst[79:64] := e3 dst[95:80] := e2 dst[111:96] := e1 dst[127:112] := e0
...
__m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)

Synopsis

__m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
#include "immintrin.h"
CPUID Flags: AVX

Description

Set packed 16-bit integers in dst with the supplied values in reverse order.

Operation

dst[15:0] := e15 dst[31:16] := e14 dst[47:32] := e13 dst[63:48] := e12 dst[79:64] := e11 dst[95:80] := e10 dst[111:96] := e9 dst[127:112] := e8 dst[143:128] := e7 dst[159:144] := e6 dst[175:160] := e5 dst[191:176] := e4 dst[207:192] := e3 dst[223:208] := e2 dst[239:224] := e1 dst[255:240] := e0 dst[MAX:256] := 0
...
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0)

Synopsis

__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Set packed 32-bit integers in dst with the supplied values in reverse order.

Operation

dst[31:0] := e3 dst[63:32] := e2 dst[95:64] := e1 dst[127:96] := e0
...
__m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)

Synopsis

__m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
#include "immintrin.h"
CPUID Flags: AVX

Description

Set packed 32-bit integers in dst with the supplied values in reverse order.

Operation

dst[31:0] := e7 dst[63:32] := e6 dst[95:64] := e5 dst[127:96] := e4 dst[159:128] := e3 dst[191:160] := e2 dst[223:192] := e1 dst[255:224] := e0 dst[MAX:256] := 0
...
__m512i _mm512_setr_epi32 (int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)

Synopsis

__m512i _mm512_setr_epi32 (int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed 32-bit integers in dst with the supplied values in reverse order.

Operation

dst[31:0] := e15 dst[63:32] := e14 dst[95:64] := e13 dst[127:96] := e12 dst[159:128] := e11 dst[191:160] := e10 dst[223:192] := e9 dst[255:224] := e8 dst[287:256] := e7 dst[319:288] := e6 dst[351:320] := e5 dst[383:352] := e4 dst[415:384] := e3 dst[447:416] := e2 dst[479:448] := e1 dst[511:480] := e0 dst[MAX:512] := 0
...
__m128i _mm_setr_epi64 (__m64 e1, __m64 e0)

Synopsis

__m128i _mm_setr_epi64 (__m64 e1, __m64 e0)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Set packed 64-bit integers in dst with the supplied values in reverse order.

Operation

dst[63:0] := e1 dst[127:64] := e0
...
__m512i _mm512_setr_epi64 (__int64 e7, __int64 e6, __int64 e5, __int64 e4, __int64 e3, __int64 e2, __int64 e1, __int64 e0)

Synopsis

__m512i _mm512_setr_epi64 (__int64 e7, __int64 e6, __int64 e5, __int64 e4, __int64 e3, __int64 e2, __int64 e1, __int64 e0)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed 64-bit integers in dst with the supplied values in reverse order.

Operation

dst[63:0] := e7 dst[127:64] := e6 dst[191:128] := e5 dst[255:192] := e4 dst[319:256] := e3 dst[383:320] := e2 dst[447:384] := e1 dst[511:448] := e0 dst[MAX:512] := 0
...
__m256i _mm256_setr_epi64x (__int64 e3, __int64 e2, __int64 e1, __int64 e0)

Synopsis

__m256i _mm256_setr_epi64x (__int64 e3, __int64 e2, __int64 e1, __int64 e0)
#include "immintrin.h"
CPUID Flags: AVX

Description

Set packed 64-bit integers in dst with the supplied values in reverse order.

Operation

dst[63:0] := e3 dst[127:64] := e2 dst[191:128] := e1 dst[255:192] := e0 dst[MAX:256] := 0
...
__m128i _mm_setr_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)

Synopsis

__m128i _mm_setr_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Set packed 8-bit integers in dst with the supplied values in reverse order.

Operation

dst[7:0] := e15 dst[15:8] := e14 dst[23:16] := e13 dst[31:24] := e12 dst[39:32] := e11 dst[47:40] := e10 dst[55:48] := e9 dst[63:56] := e8 dst[71:64] := e7 dst[79:72] := e6 dst[87:80] := e5 dst[95:88] := e4 dst[103:96] := e3 dst[111:104] := e2 dst[119:112] := e1 dst[127:120] := e0
...
__m256i _mm256_setr_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)

Synopsis

__m256i _mm256_setr_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
#include "immintrin.h"
CPUID Flags: AVX

Description

Set packed 8-bit integers in dst with the supplied values in reverse order.

Operation

dst[7:0] := e31 dst[15:8] := e30 dst[23:16] := e29 dst[31:24] := e28 dst[39:32] := e27 dst[47:40] := e26 dst[55:48] := e25 dst[63:56] := e24 dst[71:64] := e23 dst[79:72] := e22 dst[87:80] := e21 dst[95:88] := e20 dst[103:96] := e19 dst[111:104] := e18 dst[119:112] := e17 dst[127:120] := e16 dst[135:128] := e15 dst[143:136] := e14 dst[151:144] := e13 dst[159:152] := e12 dst[167:160] := e11 dst[175:168] := e10 dst[183:176] := e9 dst[191:184] := e8 dst[199:192] := e7 dst[207:200] := e6 dst[215:208] := e5 dst[223:216] := e4 dst[231:224] := e3 dst[239:232] := e2 dst[247:240] := e1 dst[255:248] := e0 dst[MAX:256] := 0
vinsertf128
__m256 _mm256_setr_m128 (__m128 lo, __m128 hi)

Synopsis

__m256 _mm256_setr_m128 (__m128 lo, __m128 hi)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX

Description

Set packed __m256 vector dst with the supplied values.

Operation

dst[127:0] := lo[127:0] dst[255:128] := hi[127:0] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge1-
Sandy Bridge1-
vinsertf128
__m256d _mm256_setr_m128d (__m128d lo, __m128d hi)

Synopsis

__m256d _mm256_setr_m128d (__m128d lo, __m128d hi)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX

Description

Set packed __m256d vector dst with the supplied values.

Operation

dst[127:0] := lo[127:0] dst[255:128] := hi[127:0] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge1-
Sandy Bridge1-
vinsertf128
__m256i _mm256_setr_m128i (__m128i lo, __m128i hi)

Synopsis

__m256i _mm256_setr_m128i (__m128i lo, __m128i hi)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX

Description

Set packed __m256i vector dst with the supplied values.

Operation

dst[127:0] := lo[127:0] dst[255:128] := hi[127:0] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge1-
Sandy Bridge1-
...
__m128d _mm_setr_pd (double e1, double e0)

Synopsis

__m128d _mm_setr_pd (double e1, double e0)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order.

Operation

dst[63:0] := e1 dst[127:64] := e0
...
__m256d _mm256_setr_pd (double e3, double e2, double e1, double e0)

Synopsis

__m256d _mm256_setr_pd (double e3, double e2, double e1, double e0)
#include "immintrin.h"
CPUID Flags: AVX

Description

Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order.

Operation

dst[63:0] := e3 dst[127:64] := e2 dst[191:128] := e1 dst[255:192] := e0 dst[MAX:256] := 0
...
__m512d _mm512_setr_pd (double e7, double e6, double e5, double e4, double e3, double e2, double e1, double e0)

Synopsis

__m512d _mm512_setr_pd (double e7, double e6, double e5, double e4, double e3, double e2, double e1, double e0)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order.

Operation

dst[63:0] := e7 dst[127:64] := e6 dst[191:128] := e5 dst[255:192] := e4 dst[319:256] := e3 dst[383:320] := e2 dst[447:384] := e1 dst[511:448] := e0 dst[MAX:512] := 0
...
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0)

Synopsis

__m128 _mm_setr_ps (float e3, float e2, float e1, float e0)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Set packed single-precision (32-bit) floating-point elements in dst with the supplied values in reverse order.

Operation

dst[31:0] := e3 dst[63:32] := e2 dst[95:64] := e1 dst[127:96] := e0
...
__m256 _mm256_setr_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)

Synopsis

__m256 _mm256_setr_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
#include "immintrin.h"
CPUID Flags: AVX

Description

Set packed single-precision (32-bit) floating-point elements in dst with the supplied values in reverse order.

Operation

dst[31:0] := e7 dst[63:32] := e6 dst[95:64] := e5 dst[127:96] := e4 dst[159:128] := e3 dst[191:160] := e2 dst[223:192] := e1 dst[255:224] := e0 dst[MAX:256] := 0
...
__m512 _mm512_setr_ps (float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)

Synopsis

__m512 _mm512_setr_ps (float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed single-precision (32-bit) floating-point elements in dst with the supplied values in reverse order.

Operation

dst[31:0] := e15 dst[63:32] := e14 dst[95:64] := e13 dst[127:96] := e12 dst[159:128] := e11 dst[191:160] := e10 dst[223:192] := e9 dst[255:224] := e8 dst[287:256] := e7 dst[319:288] := e6 dst[351:320] := e5 dst[383:352] := e4 dst[415:384] := e3 dst[447:416] := e2 dst[479:448] := e1 dst[511:480] := e0 dst[MAX:512] := 0
...
__m512i _mm512_setr4_epi32 (int d, int c, int b, int a)

Synopsis

__m512i _mm512_setr4_epi32 (int d, int c, int b, int a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed 32-bit integers in dst with the repeated 4 element sequence in reverse order.

Operation

dst[31:0] := a dst[63:32] := b dst[95:64] := c dst[127:96] := d dst[159:128] := a dst[191:160] := b dst[223:192] := c dst[255:224] := d dst[287:256] := a dst[319:288] := b dst[351:320] := c dst[383:352] := d dst[415:384] := a dst[447:416] := b dst[479:448] := c dst[511:480] := d dst[MAX:512] := 0
...
__m512i _mm512_setr4_epi64 (__int64 d, __int64 c, __int64 b, __int64 a)

Synopsis

__m512i _mm512_setr4_epi64 (__int64 d, __int64 c, __int64 b, __int64 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed 64-bit integers in dst with the repeated 4 element sequence in reverse order.

Operation

dst[63:0] := a dst[127:64] := b dst[191:128] := c dst[255:192] := d dst[319:256] := a dst[383:320] := b dst[447:384] := c dst[511:448] := d dst[MAX:512] := 0
...
__m512d _mm512_setr4_pd (double d, double c, double b, double a)

Synopsis

__m512d _mm512_setr4_pd (double d, double c, double b, double a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed double-precision (64-bit) floating-point elements in dst with the repeated 4 element sequence in reverse order.

Operation

dst[63:0] := a dst[127:64] := b dst[191:128] := c dst[255:192] := d dst[319:256] := a dst[383:320] := b dst[447:384] := c dst[511:448] := d dst[MAX:512] := 0
...
__m512 _mm512_setr4_ps (float d, float c, float b, float a)

Synopsis

__m512 _mm512_setr4_ps (float d, float c, float b, float a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Set packed single-precision (32-bit) floating-point elements in dst with the repeated 4 element sequence in reverse order.

Operation

dst[31:0] := a dst[63:32] := b dst[95:64] := c dst[127:96] := d dst[159:128] := a dst[191:160] := b dst[223:192] := c dst[255:224] := d dst[287:256] := a dst[319:288] := b dst[351:320] := c dst[383:352] := d dst[415:384] := a dst[447:416] := b dst[479:448] := c dst[511:480] := d dst[MAX:512] := 0
vpxorq
__m512 _mm512_setzero (void)

Synopsis

__m512 _mm512_setzero (void)
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Return vector of type __m512 with all elements set to zero.

Operation

dst[MAX:0] := 0
vpxorq
__m512i _mm512_setzero_epi32 ()

Synopsis

__m512i _mm512_setzero_epi32 ()
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Return vector of type __m512i with all elements set to zero.

Operation

dst[MAX:0] := 0
xorpd
__m128d _mm_setzero_pd (void)

Synopsis

__m128d _mm_setzero_pd (void)
#include "emmintrin.h"
Instruction: xorpd xmm, xmm
CPUID Flags: SSE2

Description

Return vector of type __m128d with all elements set to zero.

Operation

dst[MAX:0] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.8
Ivy Bridge10.33
Sandy Bridge10.33
Westmere10.33
Nehalem10.33
vxorpd
__m256d _mm256_setzero_pd (void)

Synopsis

__m256d _mm256_setzero_pd (void)
#include "immintrin.h"
Instruction: vxorpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Return vector of type __m256d with all elements set to zero.

Operation

dst[MAX:0] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vpxorq
__m512d _mm512_setzero_pd ()

Synopsis

__m512d _mm512_setzero_pd ()
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Return vector of type __m512d with all elements set to zero.

Operation

dst[MAX:0] := 0
xorps
__m128 _mm_setzero_ps (void)

Synopsis

__m128 _mm_setzero_ps (void)
#include "xmmintrin.h"
Instruction: xorps xmm, xmm
CPUID Flags: SSE

Description

Return vector of type __m128 with all elements set to zero.

Operation

dst[MAX:0] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.33
Ivy Bridge10.33
Sandy Bridge10.33
Westmere10.33
Nehalem10.33
vxorps
__m256 _mm256_setzero_ps (void)

Synopsis

__m256 _mm256_setzero_ps (void)
#include "immintrin.h"
Instruction: vxorps ymm, ymm, ymm
CPUID Flags: AVX

Description

Return vector of type __m256 with all elements set to zero.

Operation

dst[MAX:0] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vpxorq
__m512 _mm512_setzero_ps ()

Synopsis

__m512 _mm512_setzero_ps ()
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Return vector of type __m512 with all elements set to zero.

Operation

dst[MAX:0] := 0
pxor
__m128i _mm_setzero_si128 ()

Synopsis

__m128i _mm_setzero_si128 ()
#include "emmintrin.h"
Instruction: pxor xmm, xmm
CPUID Flags: SSE2

Description

Return vector of type __m128i with all elements set to zero.

Operation

dst[MAX:0] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.33
Ivy Bridge10.33
Sandy Bridge10.33
Westmere10.33
Nehalem10.33
vpxor
__m256i _mm256_setzero_si256 (void)

Synopsis

__m256i _mm256_setzero_si256 (void)
#include "immintrin.h"
Instruction: vpxor ymm, ymm, ymm
CPUID Flags: AVX

Description

Return vector of type __m256i with all elements set to zero.

Operation

dst[MAX:0] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpxorq
__m512i _mm512_setzero_si512 ()

Synopsis

__m512i _mm512_setzero_si512 ()
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Return vector of type __m512i with all elements set to zero.

Operation

dst[MAX:0] := 0
sfence
void _mm_sfence (void)

Synopsis

void _mm_sfence (void)
#include "xmmintrin.h"
Instruction: sfence
CPUID Flags: SSE

Description

Perform a serializing operation on all store-to-memory instructions that were issued prior to this instruction. Guarantees that every store instruction that precedes, in program order, is globally visible before any store instruction which follows the fence in program order.
sha1msg1
__m128i _mm_sha1msg1_epu32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_sha1msg1_epu32 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: sha1msg1 xmm, xmm
CPUID Flags: SHA

Description

Perform an intermediate calculation for the next four SHA1 message values (unsigned 32-bit integers) using previous message values from a and b, and store the result in dst.

Operation

W0 := a[127:96]; W1 := a[95:64]; W2 := a[63:32]; W3 := a[31:0]; W4 := b[127:96]; W5 := b[95:64]; dst[127:96] := W2 XOR W0; dst[95:64] := W3 XOR W1; dst[63:32] := W4 XOR W2; dst[31:0] := W5 XOR W3;
sha1msg2
__m128i _mm_sha1msg2_epu32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_sha1msg2_epu32 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: sha1msg2 xmm, xmm
CPUID Flags: SHA

Description

Perform the final calculation for the next four SHA1 message values (unsigned 32-bit integers) using the intermediate result in a and the previous message values in b, and store the result in dst.

Operation

W13 := b[95:64]; W14 := b[63:32]; W15 := b[31:0]; W16 := (a[127:96] XOR W13) <<< 1; W17 := (a[95:64] XOR W14) <<< 1; W18 := (a[63:32] XOR W15) <<< 1; W19 := (a[31:0] XOR W16) <<< 1; dst[127:96] := W16; dst[95:64] := W17; dst[63:32] := W18; dst[31:0] := W19;
sha1nexte
__m128i _mm_sha1nexte_epu32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_sha1nexte_epu32 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: sha1nexte xmm, xmm
CPUID Flags: SHA

Description

Calculate SHA1 state variable E after four rounds of operation from the current SHA1 state variable a, add that value to the scheduled values (unsigned 32-bit integers) in b, and store the result in dst.

Operation

tmp := (a[127:96] <<< 30); dst[127:96] := b[127:96] + tmp; dst[95:64] := b[95:64]; dst[63:32] := b[63:32]; dst[31:0] := b[31:0];
sha1rnds4
__m128i _mm_sha1rnds4_epu32 (__m128i a, __m128i b, const int func)

Synopsis

__m128i _mm_sha1rnds4_epu32 (__m128i a, __m128i b, const int func)
#include "immintrin.h"
Instruction: sha1rnds4 xmm, xmm, imm
CPUID Flags: SHA

Description

Perform four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D) from a and some pre-computed sum of the next 4 round message values (unsigned 32-bit integers), and state variable E from b, and store the updated SHA1 state (A,B,C,D) in dst. func contains the logic functions and round constants.

Operation

IF (func[1:0] = 0) THEN f() := f0(), K := K0; ELSE IF (func[1:0] = 1) THEN f() := f1(), K := K1; ELSE IF (func[1:0] = 2) THEN f() := f2(), K := K2; ELSE IF (func[1:0] = 3) THEN f() := f3(), K := K3; FI; A := a[127:96]; B := a[95:64]; C := a[63:32]; D := a[31:0]; W[0] := b[127:96]; W[1] := b[95:64]; W[2] := b[63:32]; W[3] := b[31:0]; A[1] := f(B, C, D) + (A <<< 5) + W[0] + K; B[1] := A; C[1] := B <<< 30; D[1] := C; E[1] := D; FOR i = 1 to 3 A[i+1] := f(B[i], C[i], D[i]) + (A[i] <<< 5) + W[i] + E[i] + K; B[i+1] := A[i]; C[i+1] := B[i] <<< 30; D[i+1] := C[i]; E[i+1] := D[i]; ENDFOR; dst[127:96] := A[4]; dst[95:64] := B[4]; dst[63:32] := C[4]; dst[31:0] := D[4];
sha256msg1
__m128i _mm_sha256msg1_epu32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_sha256msg1_epu32 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: sha256msg1 xmm, xmm
CPUID Flags: SHA

Description

Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from a and b, and store the result in dst.

Operation

W4 := b[31:0]; W3 := a[127:96]; W2 := a[95:64]; W1 := a[63:32]; W0 := a[31:0]; dst[127:96] := W3 + sigma0(W4); dst[95:64] := W2 + sigma0(W3); dst[63:32] := W1 + sigma0(W2); dst[31:0] := W0 + sigma0(W1);
sha256msg2
__m128i _mm_sha256msg2_epu32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_sha256msg2_epu32 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: sha256msg2 xmm, xmm
CPUID Flags: SHA

Description

Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from a and b, and store the result in dst.

Operation

W14 := b[95:64]; W15 := b[127:96]; W16 := a[31:0] + sigma1(W14); W17 := a[63:32] + sigma1(W15); W18 := a[95:64] + sigma1(W16); W19 := a[127:96] + sigma1(W17); dst[127:96] := W19; dst[95:64] := W18; dst[63:32] := W17; dst[31:0] := W16;
sha256rnds2
__m128i _mm_sha256rnds2_epu32 (__m128i a, __m128i b, __m128i k)

Synopsis

__m128i _mm_sha256rnds2_epu32 (__m128i a, __m128i b, __m128i k)
#include "immintrin.h"
Instruction: sha256rnds2 xmm, xmm
CPUID Flags: SHA

Description

Perform 2 rounds of SHA256 operation using an initial SHA256 state (C,D,G,H) from a, an initial SHA256 state (A,B,E,F) from b, and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers) and the corresponding round constants from k, and store the updated SHA256 state (A,B,E,F) in dst.

Operation

A[0] := b[127:96]; B[0] := b[95:64]; C[0] := a[127:96]; D[0] := a[95:64]; E[0] := b[63:32]; F[0] := b[31:0]; G[0] := a[63:32]; H[0] := a[31:0]; W_K[0] := k[31:0]; W_K[1] := k[63:32]; FOR i = 0 to 1 A[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i]); B[i+1] := A[i]; C[i+1] := B[i]; D[i+1] := C[i]; E[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + D[i]; F[i+1] := E[i]; G[i+1] := F[i]; H[i+1] := G[i]; ENDFOR; dst[127:96] := A[2]; dst[95:64] := B[2]; dst[63:32] := E[2]; dst[31:0] := F[2];
vpshufd
__m128i _mm_mask_shuffle_epi32 (__m128i src, __mmask8 k, __m128i a, _MM_PERM_ENUM imm8)

Synopsis

__m128i _mm_mask_shuffle_epi32 (__m128i src, __mmask8 k, __m128i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpshufd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 32-bit integers in a using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpshufd
__m128i _mm_maskz_shuffle_epi32 (__mmask8 k, __m128i a, _MM_PERM_ENUM imm8)

Synopsis

__m128i _mm_maskz_shuffle_epi32 (__mmask8 k, __m128i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpshufd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 32-bit integers in a using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
pshufd
__m128i _mm_shuffle_epi32 (__m128i a, int imm8)

Synopsis

__m128i _mm_shuffle_epi32 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: pshufd xmm, xmm, imm
CPUID Flags: SSE2

Description

Shuffle 32-bit integers in a using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } dst[31:0] := SELECT4(a[127:0], imm8[1:0]) dst[63:32] := SELECT4(a[127:0], imm8[3:2]) dst[95:64] := SELECT4(a[127:0], imm8[5:4]) dst[127:96] := SELECT4(a[127:0], imm8[7:6])

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpshufd
__m256i _mm256_mask_shuffle_epi32 (__m256i src, __mmask8 k, __m256i a, _MM_PERM_ENUM imm8)

Synopsis

__m256i _mm256_mask_shuffle_epi32 (__m256i src, __mmask8 k, __m256i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpshufd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpshufd
__m256i _mm256_maskz_shuffle_epi32 (__mmask8 k, __m256i a, _MM_PERM_ENUM imm8)

Synopsis

__m256i _mm256_maskz_shuffle_epi32 (__mmask8 k, __m256i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpshufd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpshufd
__m256i _mm256_shuffle_epi32 (__m256i a, const int imm8)

Synopsis

__m256i _mm256_shuffle_epi32 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpshufd ymm, ymm, imm
CPUID Flags: AVX2

Description

Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } dst[31:0] := SELECT4(a[127:0], imm8[1:0]) dst[63:32] := SELECT4(a[127:0], imm8[3:2]) dst[95:64] := SELECT4(a[127:0], imm8[5:4]) dst[127:96] := SELECT4(a[127:0], imm8[7:6]) dst[159:128] := SELECT4(a[255:128], imm8[1:0]) dst[191:160] := SELECT4(a[255:128], imm8[3:2]) dst[223:192] := SELECT4(a[255:128], imm8[5:4]) dst[255:224] := SELECT4(a[255:128], imm8[7:6]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
vpshufd
__m512i _mm512_mask_shuffle_epi32 (__m512i src, __mmask16 k, __m512i a, _MM_PERM_ENUM imm8)

Synopsis

__m512i _mm512_mask_shuffle_epi32 (__m512i src, __mmask16 k, __m512i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpshufd zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpshufd
__m512i _mm512_maskz_shuffle_epi32 (__mmask16 k, __m512i a, _MM_PERM_ENUM imm8)

Synopsis

__m512i _mm512_maskz_shuffle_epi32 (__mmask16 k, __m512i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpshufd zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpshufd
__m512i _mm512_shuffle_epi32 (__m512i a, _MM_PERM_ENUM imm8)

Synopsis

__m512i _mm512_shuffle_epi32 (__m512i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpshufd zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } dst[31:0] := SELECT4(a[127:0], imm8[1:0]) dst[63:32] := SELECT4(a[127:0], imm8[3:2]) dst[95:64] := SELECT4(a[127:0], imm8[5:4]) dst[127:96] := SELECT4(a[127:0], imm8[7:6]) dst[159:128] := SELECT4(a[255:128], imm8[1:0]) dst[191:160] := SELECT4(a[255:128], imm8[3:2]) dst[223:192] := SELECT4(a[255:128], imm8[5:4]) dst[255:224] := SELECT4(a[255:128], imm8[7:6]) dst[287:256] := SELECT4(a[383:256], imm8[1:0]) dst[319:288] := SELECT4(a[383:256], imm8[3:2]) dst[351:320] := SELECT4(a[383:256], imm8[5:4]) dst[383:352] := SELECT4(a[383:256], imm8[7:6]) dst[415:384] := SELECT4(a[511:384], imm8[1:0]) dst[447:416] := SELECT4(a[511:384], imm8[3:2]) dst[479:448] := SELECT4(a[511:384], imm8[5:4]) dst[511:480] := SELECT4(a[511:384], imm8[7:6]) dst[MAX:512] := 0
vpshufb
__m128i _mm_mask_shuffle_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_shuffle_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpshufb
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpshufb
__m128i _mm_maskz_shuffle_epi8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_shuffle_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpshufb
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
pshufb
__m128i _mm_shuffle_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_shuffle_epi8 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: pshufb xmm, xmm
CPUID Flags: SSSE3

Description

Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI ENDFOR
vpshufb
__m256i _mm256_mask_shuffle_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_shuffle_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpshufb
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpshufb
__m256i _mm256_maskz_shuffle_epi8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_shuffle_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpshufb
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpshufb
__m256i _mm256_shuffle_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_shuffle_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpshufb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Shuffle 8-bit integers in a within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI IF b[128+i+7] == 1 dst[128+i+7:128+i] := 0 ELSE index[3:0] := b[128+i+3:128+i] dst[128+i+7:128+i] := a[128+index*8+7:128+index*8] FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
vpshufb
__m512i _mm512_mask_shuffle_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_shuffle_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpshufb
CPUID Flags: AVX512BW

Description

Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpshufb
__m512i _mm512_maskz_shuffle_epi8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_shuffle_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpshufb
CPUID Flags: AVX512BW

Description

Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpshufb
__m512i _mm512_shuffle_epi8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_shuffle_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpshufb
CPUID Flags: AVX512BW

Description

Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst.

Operation

FOR j := 0 to 63 i := j*8 IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI ENDFOR dst[MAX:512] := 0
vshuff32x4
__m256 _mm256_mask_shuffle_f32x4 (__m256 src, __mmask8 k, __m256 a, __m256 b, const int imm8)

Synopsis

__m256 _mm256_mask_shuffle_f32x4 (__m256 src, __mmask8 k, __m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vshuff32x4
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT2(a[255:0], imm8[0]) tmp_dst[255:128] := SELECT2(b[255:0], imm8[1]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vshuff32x4
__m256 _mm256_maskz_shuffle_f32x4 (__mmask8 k, __m256 a, __m256 b, const int imm8)

Synopsis

__m256 _mm256_maskz_shuffle_f32x4 (__mmask8 k, __m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vshuff32x4
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT2(a[255:0], imm8[0]) tmp_dst[255:128] := SELECT2(b[255:0], imm8[1]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vshuff32x4
__m256 _mm256_shuffle_f32x4 (__m256 a, __m256 b, const int imm8)

Synopsis

__m256 _mm256_shuffle_f32x4 (__m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vshuff32x4
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.

Operation

SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT2(a[255:0], imm8[0]) dst[255:128] := SELECT2(b[255:0], imm8[1]) dst[MAX:256] := 0
vshuff32x4
__m512 _mm512_mask_shuffle_f32x4 (__m512 src, __mmask16 k, __m512 a, __m512 b, const int imm8)

Synopsis

__m512 _mm512_mask_shuffle_f32x4 (__m512 src, __mmask16 k, __m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vshuff32x4 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vshuff32x4
__m512 _mm512_maskz_shuffle_f32x4 (__mmask16 k, __m512 a, __m512 b, const int imm8)

Synopsis

__m512 _mm512_maskz_shuffle_f32x4 (__mmask16 k, __m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vshuff32x4 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vshuff32x4
__m512 _mm512_shuffle_f32x4 (__m512 a, __m512 b, const int imm8)

Synopsis

__m512 _mm512_shuffle_f32x4 (__m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vshuff32x4 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT4(a[511:0], imm8[1:0]) dst[255:128] := SELECT4(a[511:0], imm8[3:2]) dst[383:256] := SELECT4(b[511:0], imm8[5:4]) dst[511:384] := SELECT4(b[511:0], imm8[7:6]) dst[MAX:512] := 0
vshuff64x2
__m256d _mm256_mask_shuffle_f64x2 (__m256d src, __mmask8 k, __m256d a, __m256d b, const int imm8)

Synopsis

__m256d _mm256_mask_shuffle_f64x2 (__m256d src, __mmask8 k, __m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vshuff64x2
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT2(a[255:0], imm8[0]) tmp_dst[255:128] := SELECT2(b[255:0], imm8[1]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vshuff64x2
__m256d _mm256_maskz_shuffle_f64x2 (__mmask8 k, __m256d a, __m256d b, const int imm8)

Synopsis

__m256d _mm256_maskz_shuffle_f64x2 (__mmask8 k, __m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vshuff64x2
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT2(a[255:0], imm8[0]) tmp_dst[255:128] := SELECT2(b[255:0], imm8[1]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vshuff64x2
__m256d _mm256_shuffle_f64x2 (__m256d a, __m256d b, const int imm8)

Synopsis

__m256d _mm256_shuffle_f64x2 (__m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vshuff64x2
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.

Operation

SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT2(a[255:0], imm8[0]) dst[255:128] := SELECT2(b[255:0], imm8[1]) dst[MAX:256] := 0
vshuff64x2
__m512d _mm512_mask_shuffle_f64x2 (__m512d src, __mmask8 k, __m512d a, __m512d b, const int imm8)

Synopsis

__m512d _mm512_mask_shuffle_f64x2 (__m512d src, __mmask8 k, __m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vshuff64x2 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vshuff64x2
__m512d _mm512_maskz_shuffle_f64x2 (__mmask8 k, __m512d a, __m512d b, const int imm8)

Synopsis

__m512d _mm512_maskz_shuffle_f64x2 (__mmask8 k, __m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vshuff64x2 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vshuff64x2
__m512d _mm512_shuffle_f64x2 (__m512d a, __m512d b, const int imm8)

Synopsis

__m512d _mm512_shuffle_f64x2 (__m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vshuff64x2 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT4(a[511:0], imm8[1:0]) dst[255:128] := SELECT4(a[511:0], imm8[3:2]) dst[383:256] := SELECT4(b[511:0], imm8[5:4]) dst[511:384] := SELECT4(b[511:0], imm8[7:6]) dst[MAX:512] := 0
vshufi32x4
__m256i _mm256_mask_shuffle_i32x4 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int imm8)

Synopsis

__m256i _mm256_mask_shuffle_i32x4 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi32x4
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT2(a[255:0], imm8[1:0]) tmp_dst[255:128] := SELECT2(b[255:0], imm8[3:2]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vshufi32x4
__m256i _mm256_maskz_shuffle_i32x4 (__mmask8 k, __m256i a, __m256i b, const int imm8)

Synopsis

__m256i _mm256_maskz_shuffle_i32x4 (__mmask8 k, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi32x4
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT2(a[255:0], imm8[1:0]) tmp_dst[255:128] := SELECT2(b[255:0], imm8[3:2]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vshufi32x4
__m256i _mm256_shuffle_i32x4 (__m256i a, __m256i b, const int imm8)

Synopsis

__m256i _mm256_shuffle_i32x4 (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi32x4
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst.

Operation

SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT2(a[255:0], imm8[1:0]) dst[255:128] := SELECT2(b[255:0], imm8[3:2]) dst[MAX:256] := 0
vshufi32x4
__m512i _mm512_mask_shuffle_i32x4 (__m512i src, __mmask16 k, __m512i a, __m512i b, const int imm8)

Synopsis

__m512i _mm512_mask_shuffle_i32x4 (__m512i src, __mmask16 k, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi32x4 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vshufi32x4
__m512i _mm512_maskz_shuffle_i32x4 (__mmask16 k, __m512i a, __m512i b, const int imm8)

Synopsis

__m512i _mm512_maskz_shuffle_i32x4 (__mmask16 k, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi32x4 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vshufi32x4
__m512i _mm512_shuffle_i32x4 (__m512i a, __m512i b, const int imm8)

Synopsis

__m512i _mm512_shuffle_i32x4 (__m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi32x4 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT4(a[511:0], imm8[1:0]) dst[255:128] := SELECT4(a[511:0], imm8[3:2]) dst[383:256] := SELECT4(b[511:0], imm8[5:4]) dst[511:384] := SELECT4(b[511:0], imm8[7:6]) dst[MAX:512] := 0
vshufi64x2
__m256i _mm256_mask_shuffle_i64x2 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int imm8)

Synopsis

__m256i _mm256_mask_shuffle_i64x2 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi64x2
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT2(a[255:0], imm8[1:0]) tmp_dst[255:128] := SELECT2(b[255:0], imm8[3:2]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vshufi64x2
__m256i _mm256_maskz_shuffle_i64x2 (__mmask8 k, __m256i a, __m256i b, const int imm8)

Synopsis

__m256i _mm256_maskz_shuffle_i64x2 (__mmask8 k, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi64x2
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT2(a[255:0], imm8[1:0]) tmp_dst[255:128] := SELECT2(b[255:0], imm8[3:2]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vshufi64x2
__m256i _mm256_shuffle_i64x2 (__m256i a, __m256i b, const int imm8)

Synopsis

__m256i _mm256_shuffle_i64x2 (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi64x2
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst.

Operation

SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT2(a[255:0], imm8[1:0]) dst[255:128] := SELECT2(b[255:0], imm8[3:2]) dst[MAX:256] := 0
vshufi64x2
__m512i _mm512_mask_shuffle_i64x2 (__m512i src, __mmask8 k, __m512i a, __m512i b, const int imm8)

Synopsis

__m512i _mm512_mask_shuffle_i64x2 (__m512i src, __mmask8 k, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi64x2 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vshufi64x2
__m512i _mm512_maskz_shuffle_i64x2 (__mmask8 k, __m512i a, __m512i b, const int imm8)

Synopsis

__m512i _mm512_maskz_shuffle_i64x2 (__mmask8 k, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi64x2 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vshufi64x2
__m512i _mm512_shuffle_i64x2 (__m512i a, __m512i b, const int imm8)

Synopsis

__m512i _mm512_shuffle_i64x2 (__m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi64x2 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT4(a[511:0], imm8[1:0]) dst[255:128] := SELECT4(a[511:0], imm8[3:2]) dst[383:256] := SELECT4(b[511:0], imm8[5:4]) dst[511:384] := SELECT4(b[511:0], imm8[7:6]) dst[MAX:512] := 0
vshufpd
__m128d _mm_mask_shuffle_pd (__m128d src, __mmask8 k, __m128d a, __m128d b, const int imm8)

Synopsis

__m128d _mm_mask_shuffle_pd (__m128d src, __mmask8 k, __m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vshufpd
__m128d _mm_maskz_shuffle_pd (__mmask8 k, __m128d a, __m128d b, const int imm8)

Synopsis

__m128d _mm_maskz_shuffle_pd (__mmask8 k, __m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
shufpd
__m128d _mm_shuffle_pd (__m128d a, __m128d b, int imm8)

Synopsis

__m128d _mm_shuffle_pd (__m128d a, __m128d b, int imm8)
#include "emmintrin.h"
Instruction: shufpd xmm, xmm, imm
CPUID Flags: SSE2

Description

Shuffle double-precision (64-bit) floating-point elements using the control in imm8, and store the results in dst.

Operation

dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
vshufpd
__m256d _mm256_mask_shuffle_pd (__m256d src, __mmask8 k, __m256d a, __m256d b, const int imm8)

Synopsis

__m256d _mm256_mask_shuffle_pd (__m256d src, __mmask8 k, __m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vshufpd
__m256d _mm256_maskz_shuffle_pd (__mmask8 k, __m256d a, __m256d b, const int imm8)

Synopsis

__m256d _mm256_maskz_shuffle_pd (__mmask8 k, __m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vshufpd
__m256d _mm256_shuffle_pd (__m256d a, __m256d b, const int imm8)

Synopsis

__m256d _mm256_shuffle_pd (__m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd ymm, ymm, ymm, imm
CPUID Flags: AVX

Description

Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst.

Operation

dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vshufpd
__m512d _mm512_mask_shuffle_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, const int imm8)

Synopsis

__m512d _mm512_mask_shuffle_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vshufpd
__m512d _mm512_maskz_shuffle_pd (__mmask8 k, __m512d a, __m512d b, const int imm8)

Synopsis

__m512d _mm512_maskz_shuffle_pd (__mmask8 k, __m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vshufpd
__m512d _mm512_shuffle_pd (__m512d a, __m512d b, const int imm8)

Synopsis

__m512d _mm512_shuffle_pd (__m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst.

Operation

dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] dst[MAX:512] := 0
pshufw
__m64 _mm_shuffle_pi16 (__m64 a, int imm8)

Synopsis

__m64 _mm_shuffle_pi16 (__m64 a, int imm8)
#include "xmmintrin.h"
Instruction: pshufw mm, mm, imm
CPUID Flags: SSE

Description

Shuffle 16-bit integers in a using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[15:0] := src[15:0] 1: tmp[15:0] := src[31:16] 2: tmp[15:0] := src[47:32] 3: tmp[15:0] := src[63:48] ESAC RETURN tmp[15:0] } dst[15:0] := SELECT4(a[63:0], imm8[1:0]) dst[31:16] := SELECT4(a[63:0], imm8[3:2]) dst[47:32] := SELECT4(a[63:0], imm8[5:4]) dst[63:48] := SELECT4(a[63:0], imm8[7:6])
pshufb
__m64 _mm_shuffle_pi8 (__m64 a, __m64 b)

Synopsis

__m64 _mm_shuffle_pi8 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: pshufb mm, mm
CPUID Flags: SSSE3

Description

Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*8 IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[2:0] := b[i+2:i] dst[i+7:i] := a[index*8+7:index*8] FI ENDFOR
vshufps
__m128 _mm_mask_shuffle_ps (__m128 src, __mmask8 k, __m128 a, __m128 b, const int imm8)

Synopsis

__m128 _mm_mask_shuffle_ps (__m128 src, __mmask8 k, __m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vshufps
__m128 _mm_maskz_shuffle_ps (__mmask8 k, __m128 a, __m128 b, const int imm8)

Synopsis

__m128 _mm_maskz_shuffle_ps (__mmask8 k, __m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
shufps
__m128 _mm_shuffle_ps (__m128 a, __m128 b, unsigned int imm8)

Synopsis

__m128 _mm_shuffle_ps (__m128 a, __m128 b, unsigned int imm8)
#include "xmmintrin.h"
Instruction: shufps xmm, xmm, imm
CPUID Flags: SSE

Description

Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } dst[31:0] := SELECT4(a[127:0], imm8[1:0]) dst[63:32] := SELECT4(a[127:0], imm8[3:2]) dst[95:64] := SELECT4(b[127:0], imm8[5:4]) dst[127:96] := SELECT4(b[127:0], imm8[7:6])

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
Westmere11
Nehalem11
vshufps
__m256 _mm256_mask_shuffle_ps (__m256 src, __mmask8 k, __m256 a, __m256 b, const int imm8)

Synopsis

__m256 _mm256_mask_shuffle_ps (__m256 src, __mmask8 k, __m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vshufps
__m256 _mm256_maskz_shuffle_ps (__mmask8 k, __m256 a, __m256 b, const int imm8)

Synopsis

__m256 _mm256_maskz_shuffle_ps (__mmask8 k, __m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps
CPUID Flags: AVX512VL + AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vshufps
__m256 _mm256_shuffle_ps (__m256 a, __m256 b, const int imm8)

Synopsis

__m256 _mm256_shuffle_ps (__m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps ymm, ymm, ymm, imm
CPUID Flags: AVX

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } dst[31:0] := SELECT4(a[127:0], imm8[1:0]) dst[63:32] := SELECT4(a[127:0], imm8[3:2]) dst[95:64] := SELECT4(b[127:0], imm8[5:4]) dst[127:96] := SELECT4(b[127:0], imm8[7:6]) dst[159:128] := SELECT4(a[255:128], imm8[1:0]) dst[191:160] := SELECT4(a[255:128], imm8[3:2]) dst[223:192] := SELECT4(b[255:128], imm8[5:4]) dst[255:224] := SELECT4(b[255:128], imm8[7:6]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vshufps
__m512 _mm512_mask_shuffle_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, const int imm8)

Synopsis

__m512 _mm512_mask_shuffle_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4]) tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6]) tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4]) tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vshufps
__m512 _mm512_maskz_shuffle_ps (__mmask16 k, __m512 a, __m512 b, const int imm8)

Synopsis

__m512 _mm512_maskz_shuffle_ps (__mmask16 k, __m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4]) tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6]) tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4]) tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vshufps
__m512 _mm512_shuffle_ps (__m512 a, __m512 b, const int imm8)

Synopsis

__m512 _mm512_shuffle_ps (__m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.

Operation

SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } dst[31:0] := SELECT4(a[127:0], imm8[1:0]) dst[63:32] := SELECT4(a[127:0], imm8[3:2]) dst[95:64] := SELECT4(b[127:0], imm8[5:4]) dst[127:96] := SELECT4(b[127:0], imm8[7:6]) dst[159:128] := SELECT4(a[255:128], imm8[1:0]) dst[191:160] := SELECT4(a[255:128], imm8[3:2]) dst[223:192] := SELECT4(b[255:128], imm8[5:4]) dst[255:224] := SELECT4(b[255:128], imm8[7:6]) dst[287:256] := SELECT4(a[383:256], imm8[1:0]) dst[319:288] := SELECT4(a[383:256], imm8[3:2]) dst[351:320] := SELECT4(b[383:256], imm8[5:4]) dst[383:352] := SELECT4(b[383:256], imm8[7:6]) dst[415:384] := SELECT4(a[511:384], imm8[1:0]) dst[447:416] := SELECT4(a[511:384], imm8[3:2]) dst[479:448] := SELECT4(b[511:384], imm8[5:4]) dst[511:480] := SELECT4(b[511:384], imm8[7:6]) dst[MAX:512] := 0
vpshufhw
__m128i _mm_mask_shufflehi_epi16 (__m128i src, __mmask8 k, __m128i a, int imm8)

Synopsis

__m128i _mm_mask_shufflehi_epi16 (__m128i src, __mmask8 k, __m128i a, int imm8)
#include "immintrin.h"
Instruction: vpshufhw
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. Store the results in the high 64 bits of dst, with the low 64 bits being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[63:0] := a[63:0] tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpshufhw
__m128i _mm_maskz_shufflehi_epi16 (__mmask8 k, __m128i a, int imm8)

Synopsis

__m128i _mm_maskz_shufflehi_epi16 (__mmask8 k, __m128i a, int imm8)
#include "immintrin.h"
Instruction: vpshufhw
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. Store the results in the high 64 bits of dst, with the low 64 bits being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[63:0] := a[63:0] tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
pshufhw
__m128i _mm_shufflehi_epi16 (__m128i a, int imm8)

Synopsis

__m128i _mm_shufflehi_epi16 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: pshufhw xmm, xmm, imm
CPUID Flags: SSE2

Description

Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. Store the results in the high 64 bits of dst, with the low 64 bits being copied from a to dst.

Operation

dst[63:0] := a[63:0] dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpshufhw
__m256i _mm256_mask_shufflehi_epi16 (__m256i src, __mmask16 k, __m256i a, int imm8)

Synopsis

__m256i _mm256_mask_shufflehi_epi16 (__m256i src, __mmask16 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vpshufhw
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[63:0] := a[63:0] tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] tmp_dst[191:128] := a[191:128] tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpshufhw
__m256i _mm256_maskz_shufflehi_epi16 (__mmask16 k, __m256i a, int imm8)

Synopsis

__m256i _mm256_maskz_shufflehi_epi16 (__mmask16 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vpshufhw
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[63:0] := a[63:0] tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] tmp_dst[191:128] := a[191:128] tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpshufhw
__m256i _mm256_shufflehi_epi16 (__m256i a, const int imm8)

Synopsis

__m256i _mm256_shufflehi_epi16 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpshufhw ymm, ymm, imm
CPUID Flags: AVX2

Description

Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst.

Operation

dst[63:0] := a[63:0] dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] dst[191:128] := a[191:128] dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
vpshufhw
__m512i _mm512_mask_shufflehi_epi16 (__m512i src, __mmask32 k, __m512i a, int imm8)

Synopsis

__m512i _mm512_mask_shufflehi_epi16 (__m512i src, __mmask32 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vpshufhw
CPUID Flags: AVX512BW

Description

Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[63:0] := a[63:0] tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] tmp_dst[191:128] := a[191:128] tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] tmp_dst[319:256] := a[319:256] tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] tmp_dst[447:384] := a[447:384] tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpshufhw
__m512i _mm512_maskz_shufflehi_epi16 (__mmask32 k, __m512i a, int imm8)

Synopsis

__m512i _mm512_maskz_shufflehi_epi16 (__mmask32 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vpshufhw
CPUID Flags: AVX512BW

Description

Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[63:0] := a[63:0] tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] tmp_dst[191:128] := a[191:128] tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] tmp_dst[319:256] := a[319:256] tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] tmp_dst[447:384] := a[447:384] tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpshufhw
__m512i _mm512_shufflehi_epi16 (__m512i a, int imm8)

Synopsis

__m512i _mm512_shufflehi_epi16 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vpshufhw
CPUID Flags: AVX512BW

Description

Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst.

Operation

dst[63:0] := a[63:0] dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] dst[191:128] := a[191:128] dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] dst[319:256] := a[319:256] dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] dst[447:384] := a[447:384] dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] dst[MAX:512] := 0
vpshuflw
__m128i _mm_mask_shufflelo_epi16 (__m128i src, __mmask8 k, __m128i a, int imm8)

Synopsis

__m128i _mm_mask_shufflelo_epi16 (__m128i src, __mmask8 k, __m128i a, int imm8)
#include "immintrin.h"
Instruction: vpshuflw
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. Store the results in the low 64 bits of dst, with the high 64 bits being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] tmp_dst[127:64] := a[127:64] FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpshuflw
__m128i _mm_maskz_shufflelo_epi16 (__mmask8 k, __m128i a, int imm8)

Synopsis

__m128i _mm_maskz_shufflelo_epi16 (__mmask8 k, __m128i a, int imm8)
#include "immintrin.h"
Instruction: vpshuflw
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. Store the results in the low 64 bits of dst, with the high 64 bits being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] tmp_dst[127:64] := a[127:64] FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
pshuflw
__m128i _mm_shufflelo_epi16 (__m128i a, int imm8)

Synopsis

__m128i _mm_shufflelo_epi16 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: pshuflw xmm, xmm, imm
CPUID Flags: SSE2

Description

Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. Store the results in the low 64 bits of dst, with the high 64 bits being copied from a to dst.

Operation

dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] dst[127:64] := a[127:64]

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpshuflw
__m256i _mm256_mask_shufflelo_epi16 (__m256i src, __mmask16 k, __m256i a, int imm8)

Synopsis

__m256i _mm256_mask_shufflelo_epi16 (__m256i src, __mmask16 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vpshuflw
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] tmp_dst[127:64] := a[127:64] tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] tmp_dst[255:192] := a[255:192] FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpshuflw
__m256i _mm256_maskz_shufflelo_epi16 (__mmask16 k, __m256i a, int imm8)

Synopsis

__m256i _mm256_maskz_shufflelo_epi16 (__mmask16 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vpshuflw
CPUID Flags: AVX512VL + AVX512BW

Description

Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] tmp_dst[127:64] := a[127:64] tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] tmp_dst[255:192] := a[255:192] FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpshuflw
__m256i _mm256_shufflelo_epi16 (__m256i a, const int imm8)

Synopsis

__m256i _mm256_shufflelo_epi16 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpshuflw ymm, ymm, imm
CPUID Flags: AVX2

Description

Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst.

Operation

dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] dst[127:64] := a[127:64] dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] dst[255:192] := a[255:192] dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
vpshuflw
__m512i _mm512_mask_shufflelo_epi16 (__m512i src, __mmask32 k, __m512i a, int imm8)

Synopsis

__m512i _mm512_mask_shufflelo_epi16 (__m512i src, __mmask32 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vpshuflw
CPUID Flags: AVX512BW

Description

Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] tmp_dst[127:64] := a[127:64] tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] tmp_dst[255:192] := a[255:192] tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] tmp_dst[383:320] := a[383:320] tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] tmp_dst[511:448] := a[511:448] FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpshuflw
__m512i _mm512_maskz_shufflelo_epi16 (__mmask32 k, __m512i a, int imm8)

Synopsis

__m512i _mm512_maskz_shufflelo_epi16 (__mmask32 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vpshuflw
CPUID Flags: AVX512BW

Description

Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] tmp_dst[127:64] := a[127:64] tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] tmp_dst[255:192] := a[255:192] tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] tmp_dst[383:320] := a[383:320] tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] tmp_dst[511:448] := a[511:448] FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpshuflw
__m512i _mm512_shufflelo_epi16 (__m512i a, int imm8)

Synopsis

__m512i _mm512_shufflelo_epi16 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vpshuflw
CPUID Flags: AVX512BW

Description

Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst.

Operation

dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] dst[127:64] := a[127:64] dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] dst[255:192] := a[255:192] dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] dst[383:320] := a[383:320] dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] dst[511:448] := a[511:448] dst[MAX:512] := 0
psignw
__m128i _mm_sign_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_sign_epi16 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: psignw xmm, xmm
CPUID Flags: SSSE3

Description

Negate packed 16-bit integers in a when the corresponding signed 16-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.

Operation

FOR j := 0 to 7 i := j*16 IF b[i+15:i] < 0 dst[i+15:i] := NEG(a[i+15:i]) ELSE IF b[i+15:i] = 0 dst[i+15:i] := 0 ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpsignw
__m256i _mm256_sign_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_sign_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsignw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Negate packed 16-bit integers in a when the corresponding signed 16-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.

Operation

FOR j := 0 to 15 i := j*16 IF b[i+15:i] < 0 dst[i+15:i] := NEG(a[i+15:i]) ELSE IF b[i+15:i] = 0 dst[i+15:i] := 0 ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.5
psignd
__m128i _mm_sign_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_sign_epi32 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: psignd xmm, xmm
CPUID Flags: SSSE3

Description

Negate packed 32-bit integers in a when the corresponding signed 32-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.

Operation

FOR j := 0 to 3 i := j*32 IF b[i+31:i] < 0 dst[i+31:i] := NEG(a[i+31:i]) ELSE IF b[i+31:i] = 0 dst[i+31:i] := 0 ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpsignd
__m256i _mm256_sign_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_sign_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsignd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Negate packed 32-bit integers in a when the corresponding signed 32-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.

Operation

FOR j := 0 to 7 i := j*32 IF b[i+31:i] < 0 dst[i+31:i] := NEG(a[i+31:i]) ELSE IF b[i+31:i] = 0 dst[i+31:i] := 0 ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.5
psignb
__m128i _mm_sign_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_sign_epi8 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: psignb xmm, xmm
CPUID Flags: SSSE3

Description

Negate packed 8-bit integers in a when the corresponding signed 8-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.

Operation

FOR j := 0 to 15 i := j*8 IF b[i+7:i] < 0 dst[i+7:i] := NEG(a[i+7:i]) ELSE IF b[i+7:i] = 0 dst[i+7:i] := 0 ELSE dst[i+7:i] := a[i+7:i] FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpsignb
__m256i _mm256_sign_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_sign_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsignb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Negate packed 8-bit integers in a when the corresponding signed 8-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.

Operation

FOR j := 0 to 31 i := j*8 IF b[i+7:i] < 0 dst[i+7:i] := NEG(a[i+7:i]) ELSE IF b[i+7:i] = 0 dst[i+7:i] := 0 ELSE dst[i+7:i] := a[i+7:i] FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell10.5
psignw
__m64 _mm_sign_pi16 (__m64 a, __m64 b)

Synopsis

__m64 _mm_sign_pi16 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: psignw mm, mm
CPUID Flags: SSSE3

Description

Negate packed 16-bit integers in a when the corresponding signed 16-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.

Operation

FOR j := 0 to 3 i := j*16 IF b[i+15:i] < 0 dst[i+15:i] := NEG(a[i+15:i]) ELSE IF b[i+15:i] = 0 dst[i+15:i] := 0 ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR
psignd
__m64 _mm_sign_pi32 (__m64 a, __m64 b)

Synopsis

__m64 _mm_sign_pi32 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: psignd mm, mm
CPUID Flags: SSSE3

Description

Negate packed 32-bit integers in a when the corresponding signed 32-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.

Operation

FOR j := 0 to 1 i := j*32 IF b[i+31:i] < 0 dst[i+31:i] := NEG(a[i+31:i]) ELSE IF b[i+31:i] = 0 dst[i+31:i] := 0 ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR
psignb
__m64 _mm_sign_pi8 (__m64 a, __m64 b)

Synopsis

__m64 _mm_sign_pi8 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: psignb mm, mm
CPUID Flags: SSSE3

Description

Negate packed 8-bit integers in a when the corresponding signed 8-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.

Operation

FOR j := 0 to 7 i := j*8 IF b[i+7:i] < 0 dst[i+7:i] := NEG(a[i+7:i]) ELSE IF b[i+7:i] = 0 dst[i+7:i] := 0 ELSE dst[i+7:i] := a[i+7:i] FI ENDFOR
...
__m128d _mm_sin_pd (__m128d a)

Synopsis

__m128d _mm_sin_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := SIN(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_sin_pd (__m256d a)

Synopsis

__m256d _mm256_sin_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := SIN(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_mask_sin_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_sin_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SIN(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_sin_pd (__m512d a)

Synopsis

__m512d _mm512_sin_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := SIN(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m128 _mm_sin_ps (__m128 a)

Synopsis

__m128 _mm_sin_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := SIN(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_sin_ps (__m256 a)

Synopsis

__m256 _mm256_sin_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := SIN(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_mask_sin_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_sin_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SIN(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_sin_ps (__m512 a)

Synopsis

__m512 _mm512_sin_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := SIN(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m128d _mm_sincos_pd (__m128d * mem_addr, __m128d a)

Synopsis

__m128d _mm_sincos_pd (__m128d * mem_addr, __m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, store the sine in dst, and store the cosine into memory at mem_addr.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := SIN(a[i+63:i]) MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_sincos_pd (__m256d * mem_addr, __m256d a)

Synopsis

__m256d _mm256_sincos_pd (__m256d * mem_addr, __m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, store the sine in dst, and store the cosine into memory at mem_addr.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := SIN(a[i+63:i]) MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_mask_sincos_pd (__m512d * cos_res, __m512d sin_src, __m512d cos_src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_sincos_pd (__m512d * cos_res, __m512d sin_src, __m512d cos_src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Computes the sine and cosine of the packed double-precision (64-bit) floating-point elements in a and stores the results of the sine computation in dst and the results of the cosine computation in cos_res. Elements are written to their respective locations using writemask k (elements are copied from sin_src or cos_src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SIN(a[i+63:i]) cos_res[i+63:i] := COS(a[i+63:i]) ELSE dst[i+63:i] := sin_src[i+63:i] cos_res[i+63:i] := cos_src[i+63:i] FI ENDFOR dst[MAX:512] := 0 cos_res[MAX:512] := 0
...
__m512d _mm512_sincos_pd (__m512d * cos_res, __m512d a)

Synopsis

__m512d _mm512_sincos_pd (__m512d * cos_res, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Computes the sine and cosine of the packed double-precision (64-bit) floating-point elements in a and stores the results of the sine computation in dst and the results of the cosine computation in cos_res.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := SIN(a[i+63:i]) cos_res[i+63:i] := COS(a[i+63:i]) ENDFOR dst[MAX:512] := 0 cos_res[MAX:512] := 0
...
__m128 _mm_sincos_ps (__m128 * mem_addr, __m128 a)

Synopsis

__m128 _mm_sincos_ps (__m128 * mem_addr, __m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, store the sine in dst, and store the cosine into memory at mem_addr.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := SIN(a[i+31:i]) MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_sincos_ps (__m256 * mem_addr, __m256 a)

Synopsis

__m256 _mm256_sincos_ps (__m256 * mem_addr, __m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, store the sine in dst, and store the cosine into memory at mem_addr.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := SIN(a[i+31:i]) MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_mask_sincos_ps (__m512 * cos_res, __m512 sin_src, __m512 cos_src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_sincos_ps (__m512 * cos_res, __m512 sin_src, __m512 cos_src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Computes the sine and cosine of the packed single-precision (32-bit) floating-point elements in a and stores the results of the sine computation in dst and the results of the cosine computation in cos_res. Elements are written to their respective locations using writemask k (elements are copied from sin_src or cos_src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SIN(a[i+31:i]) cos_res[i+31:i] := COS(a[i+31:i]) ELSE dst[i+31:i] := sin_src[i+31:i] cos_res[i+31:i] := cos_src[i+31:i] FI ENDFOR dst[MAX:512] := 0 cos_res[MAX:512] := 0
...
__m512 _mm512_sincos_ps (__m512 * cos_res, __m512 a)

Synopsis

__m512 _mm512_sincos_ps (__m512 * cos_res, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Computes the sine and cosine of the packed single-precision (32-bit) floating-point elements in a and stores the results of the sine computation in dst and the results of the cosine computation in cos_res.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := SIN(a[i+31:i]) cos_res[i+31:i] := COS(a[i+31:i]) ENDFOR dst[MAX:512] := 0 cos_res[MAX:512] := 0
...
__m128d _mm_sind_pd (__m128d a)

Synopsis

__m128d _mm_sind_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := SIND(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_sind_pd (__m256d a)

Synopsis

__m256d _mm256_sind_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := SIND(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_mask_sind_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_sind_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SIND(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_sind_pd (__m512d a)

Synopsis

__m512d _mm512_sind_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := SIND(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m128 _mm_sind_ps (__m128 a)

Synopsis

__m128 _mm_sind_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := SIND(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_sind_ps (__m256 a)

Synopsis

__m256 _mm256_sind_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := SIND(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_mask_sind_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_sind_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SIND(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_sind_ps (__m512 a)

Synopsis

__m512 _mm512_sind_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := SIND(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m128d _mm_sinh_pd (__m128d a)

Synopsis

__m128d _mm_sinh_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := SINH(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_sinh_pd (__m256d a)

Synopsis

__m256d _mm256_sinh_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := SINH(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_mask_sinh_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_sinh_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SINH(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_sinh_pd (__m512d a)

Synopsis

__m512d _mm512_sinh_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := SINH(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m128 _mm_sinh_ps (__m128 a)

Synopsis

__m128 _mm_sinh_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := SINH(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_sinh_ps (__m256 a)

Synopsis

__m256 _mm256_sinh_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := SINH(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_mask_sinh_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_sinh_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SINH(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_sinh_ps (__m512 a)

Synopsis

__m512 _mm512_sinh_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := SINH(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vpsllw
__m128i _mm_mask_sll_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_sll_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpsllw
__m128i _mm_maskz_sll_epi16 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_sll_epi16 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
psllw
__m128i _mm_sll_epi16 (__m128i a, __m128i count)

Synopsis

__m128i _mm_sll_epi16 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: psllw xmm, xmm
CPUID Flags: SSE2

Description

Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ENDFOR

Performance

ArchitectureLatencyThroughput
Westmere2-
Nehalem2-
vpsllw
__m256i _mm256_mask_sll_epi16 (__m256i src, __mmask16 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_mask_sll_epi16 (__m256i src, __mmask16 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpsllw
__m256i _mm256_maskz_sll_epi16 (__mmask16 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_maskz_sll_epi16 (__mmask16 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsllw
__m256i _mm256_sll_epi16 (__m256i a, __m128i count)

Synopsis

__m256i _mm256_sll_epi16 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw ymm, ymm, xmm
CPUID Flags: AVX2

Description

Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell40.5
Haswell4-
vpsllw
__m512i _mm512_mask_sll_epi16 (__m512i src, __mmask32 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_mask_sll_epi16 (__m512i src, __mmask32 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpsllw
__m512i _mm512_maskz_sll_epi16 (__mmask32 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_maskz_sll_epi16 (__mmask32 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsllw
__m512i _mm512_sll_epi16 (__m512i a, __m128i count)

Synopsis

__m512i _mm512_sll_epi16 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ENDFOR dst[MAX:512] := 0
vpslld
__m128i _mm_mask_sll_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_sll_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpslld
__m128i _mm_maskz_sll_epi32 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_sll_epi32 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
pslld
__m128i _mm_sll_epi32 (__m128i a, __m128i count)

Synopsis

__m128i _mm_sll_epi32 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: pslld xmm, xmm
CPUID Flags: SSE2

Description

Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0]) FI ENDFOR

Performance

ArchitectureLatencyThroughput
Westmere2-
Nehalem2-
vpslld
__m256i _mm256_mask_sll_epi32 (__m256i src, __mmask8 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_mask_sll_epi32 (__m256i src, __mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpslld
__m256i _mm256_maskz_sll_epi32 (__mmask8 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_maskz_sll_epi32 (__mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpslld
__m256i _mm256_sll_epi32 (__m256i a, __m128i count)

Synopsis

__m256i _mm256_sll_epi32 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld ymm, ymm, xmm
CPUID Flags: AVX2

Description

Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0]) FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell40.5
vpslld
__m512i _mm512_mask_sll_epi32 (__m512i src, __mmask16 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_mask_sll_epi32 (__m512i src, __mmask16 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpslld
__m512i _mm512_maskz_sll_epi32 (__mmask16 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_maskz_sll_epi32 (__mmask16 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpslld
__m512i _mm512_sll_epi32 (__m512i a, __m128i count)

Synopsis

__m512i _mm512_sll_epi32 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0]) FI ENDFOR dst[MAX:512] := 0
vpsllq
__m128i _mm_mask_sll_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_sll_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpsllq
__m128i _mm_maskz_sll_epi64 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_sll_epi64 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
psllq
__m128i _mm_sll_epi64 (__m128i a, __m128i count)

Synopsis

__m128i _mm_sll_epi64 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: psllq xmm, xmm
CPUID Flags: SSE2

Description

Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0]) FI ENDFOR

Performance

ArchitectureLatencyThroughput
Westmere2-
Nehalem2-
vpsllq
__m256i _mm256_mask_sll_epi64 (__m256i src, __mmask8 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_mask_sll_epi64 (__m256i src, __mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpsllq
__m256i _mm256_maskz_sll_epi64 (__mmask8 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_maskz_sll_epi64 (__mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsllq
__m256i _mm256_sll_epi64 (__m256i a, __m128i count)

Synopsis

__m256i _mm256_sll_epi64 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq ymm, ymm, xmm
CPUID Flags: AVX2

Description

Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0]) FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell40.5
vpsllq
__m512i _mm512_mask_sll_epi64 (__m512i src, __mmask8 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_mask_sll_epi64 (__m512i src, __mmask8 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpsllq
__m512i _mm512_maskz_sll_epi64 (__mmask8 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_maskz_sll_epi64 (__mmask8 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsllq
__m512i _mm512_sll_epi64 (__m512i a, __m128i count)

Synopsis

__m512i _mm512_sll_epi64 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0]) FI ENDFOR dst[MAX:512] := 0
vpsllw
__m128i _mm_mask_slli_epi16 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_mask_slli_epi16 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpsllw
__m128i _mm_maskz_slli_epi16 (__mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_maskz_slli_epi16 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
psllw
__m128i _mm_slli_epi16 (__m128i a, int imm8)

Synopsis

__m128i _mm_slli_epi16 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psllw xmm, imm
CPUID Flags: SSE2

Description

Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
Westmere11
Nehalem11
vpsllw
__m256i _mm256_mask_slli_epi16 (__m256i src, __mmask16 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_mask_slli_epi16 (__m256i src, __mmask16 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpsllw
__m256i _mm256_maskz_slli_epi16 (__mmask16 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_maskz_slli_epi16 (__mmask16 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsllw
__m256i _mm256_slli_epi16 (__m256i a, int imm8)

Synopsis

__m256i _mm256_slli_epi16 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsllw ymm, ymm, imm
CPUID Flags: AVX2

Description

Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
Haswell1-
vpsllw
__m512i _mm512_mask_slli_epi16 (__m512i src, __mmask32 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_mask_slli_epi16 (__m512i src, __mmask32 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpsllw
__m512i _mm512_maskz_slli_epi16 (__mmask32 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_maskz_slli_epi16 (__mmask32 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsllw
__m512i _mm512_slli_epi16 (__m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_slli_epi16 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ENDFOR dst[MAX:512] := 0
vpslld
__m128i _mm_mask_slli_epi32 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_mask_slli_epi32 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpslld
__m128i _mm_maskz_slli_epi32 (__mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_maskz_slli_epi32 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
pslld
__m128i _mm_slli_epi32 (__m128i a, int imm8)

Synopsis

__m128i _mm_slli_epi32 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: pslld xmm, imm
CPUID Flags: SSE2

Description

Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0]) FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
Westmere11
Nehalem11
vpslld
__m256i _mm256_mask_slli_epi32 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_mask_slli_epi32 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpslld
__m256i _mm256_maskz_slli_epi32 (__mmask8 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_maskz_slli_epi32 (__mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpslld
__m256i _mm256_slli_epi32 (__m256i a, int imm8)

Synopsis

__m256i _mm256_slli_epi32 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpslld ymm, ymm, imm
CPUID Flags: AVX2

Description

Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0]) FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpslld
__m512i _mm512_mask_slli_epi32 (__m512i src, __mmask16 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_mask_slli_epi32 (__m512i src, __mmask16 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpslld zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpslld
__m512i _mm512_maskz_slli_epi32 (__mmask16 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_maskz_slli_epi32 (__mmask16 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpslld zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpslld
__m512i _mm512_slli_epi32 (__m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_slli_epi32 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpslld zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0]) FI ENDFOR dst[MAX:512] := 0
vpsllq
__m128i _mm_mask_slli_epi64 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_mask_slli_epi64 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpsllq
__m128i _mm_maskz_slli_epi64 (__mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_maskz_slli_epi64 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
psllq
__m128i _mm_slli_epi64 (__m128i a, int imm8)

Synopsis

__m128i _mm_slli_epi64 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psllq xmm, imm
CPUID Flags: SSE2

Description

Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0]) FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
Westmere11
Nehalem11
vpsllq
__m256i _mm256_mask_slli_epi64 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_mask_slli_epi64 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpsllq
__m256i _mm256_maskz_slli_epi64 (__mmask8 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_maskz_slli_epi64 (__mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsllq
__m256i _mm256_slli_epi64 (__m256i a, int imm8)

Synopsis

__m256i _mm256_slli_epi64 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsllq ymm, ymm, imm
CPUID Flags: AVX2

Description

Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0]) FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsllq
__m512i _mm512_mask_slli_epi64 (__m512i src, __mmask8 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_mask_slli_epi64 (__m512i src, __mmask8 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpsllq
__m512i _mm512_maskz_slli_epi64 (__mmask8 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_maskz_slli_epi64 (__mmask8 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsllq
__m512i _mm512_slli_epi64 (__m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_slli_epi64 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0]) FI ENDFOR dst[MAX:512] := 0
pslldq
__m128i _mm_slli_si128 (__m128i a, int imm8)

Synopsis

__m128i _mm_slli_si128 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: pslldq xmm, imm
CPUID Flags: SSE2

Description

Shift a left by imm8 bytes while shifting in zeros, and store the results in dst.

Operation

tmp := imm8[7:0] IF tmp > 15 tmp := 16 FI dst[127:0] := a[127:0] << (tmp*8)

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpslldq
__m256i _mm256_slli_si256 (__m256i a, const int imm8)

Synopsis

__m256i _mm256_slli_si256 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpslldq ymm, ymm, imm
CPUID Flags: AVX2

Description

Shift 128-bit lanes in a left by imm8 bytes while shifting in zeros, and store the results in dst.

Operation

tmp := imm8[7:0] IF tmp > 15 tmp := 16 FI dst[127:0] := a[127:0] << (tmp*8) dst[255:128] := a[255:128] << (tmp*8) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsllvw
__m128i _mm_mask_sllv_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_sllv_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpsllvw
__m128i _mm_maskz_sllv_epi16 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_sllv_epi16 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpsllvw
__m128i _mm_sllv_epi16 (__m128i a, __m128i count)

Synopsis

__m128i _mm_sllv_epi16 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ENDFOR dst[MAX:128] := 0
vpsllvw
__m256i _mm256_mask_sllv_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_mask_sllv_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpsllvw
__m256i _mm256_maskz_sllv_epi16 (__mmask16 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_maskz_sllv_epi16 (__mmask16 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsllvw
__m256i _mm256_sllv_epi16 (__m256i a, __m256i count)

Synopsis

__m256i _mm256_sllv_epi16 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ENDFOR dst[MAX:256] := 0
vpsllvw
__m512i _mm512_mask_sllv_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_mask_sllv_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpsllvw
__m512i _mm512_maskz_sllv_epi16 (__mmask32 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_maskz_sllv_epi16 (__mmask32 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsllvw
__m512i _mm512_sllv_epi16 (__m512i a, __m512i count)

Synopsis

__m512i _mm512_sllv_epi16 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ENDFOR dst[MAX:512] := 0
vpsllvd
__m128i _mm_mask_sllv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_sllv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvd
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpsllvd
__m128i _mm_maskz_sllv_epi32 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_sllv_epi32 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvd
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpsllvd
__m128i _mm_sllv_epi32 (__m128i a, __m128i count)

Synopsis

__m128i _mm_sllv_epi32 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvd xmm, xmm, xmm
CPUID Flags: AVX2

Description

Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i]) ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell22
vpsllvd
__m256i _mm256_mask_sllv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_mask_sllv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvd
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpsllvd
__m256i _mm256_maskz_sllv_epi32 (__mmask8 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_maskz_sllv_epi32 (__mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvd
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsllvd
__m256i _mm256_sllv_epi32 (__m256i a, __m256i count)

Synopsis

__m256i _mm256_sllv_epi32 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell22
vpsllvd
__m512i _mm512_mask_sllv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_mask_sllv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpsllvd
__m512i _mm512_maskz_sllv_epi32 (__mmask16 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_maskz_sllv_epi32 (__mmask16 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsllvd
__m512i _mm512_sllv_epi32 (__m512i a, __m512i count)

Synopsis

__m512i _mm512_sllv_epi32 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i]) ENDFOR dst[MAX:512] := 0
vpsllvq
__m128i _mm_mask_sllv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_sllv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpsllvq
__m128i _mm_maskz_sllv_epi64 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_sllv_epi64 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpsllvq
__m128i _mm_sllv_epi64 (__m128i a, __m128i count)

Synopsis

__m128i _mm_sllv_epi64 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvq xmm, xmm, xmm
CPUID Flags: AVX2

Description

Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i]) ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsllvq
__m256i _mm256_mask_sllv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_mask_sllv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpsllvq
__m256i _mm256_maskz_sllv_epi64 (__mmask8 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_maskz_sllv_epi64 (__mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsllvq
__m256i _mm256_sllv_epi64 (__m256i a, __m256i count)

Synopsis

__m256i _mm256_sllv_epi64 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsllvq
__m512i _mm512_mask_sllv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_mask_sllv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpsllvq
__m512i _mm512_maskz_sllv_epi64 (__mmask8 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_maskz_sllv_epi64 (__mmask8 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsllvq
__m512i _mm512_sllv_epi64 (__m512i a, __m512i count)

Synopsis

__m512i _mm512_sllv_epi64 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i]) ENDFOR dst[MAX:512] := 0
spflt
void _mm_spflt_32 (unsigned int r1)

Synopsis

void _mm_spflt_32 (unsigned int r1)
#include "immintrin.h"
Instruction: spflt r
CPUID Flags: KNCNI

Description

Set performance monitoring filtering mask to 32-bit unsigned integer r1.

Operation

SetPerfMonMask(r1[31:0])
spflt
void _mm_spflt_64 (unsigned __int64 r1)

Synopsis

void _mm_spflt_64 (unsigned __int64 r1)
#include "immintrin.h"
Instruction: spflt r
CPUID Flags: KNCNI

Description

Set performance monitoring filtering mask to 64-bit unsigned integer r1.

Operation

SetPerfMonMask(r1[63:0])
vsqrtpd
__m128d _mm_mask_sqrt_pd (__m128d src, __mmask8 k, __m128d a)

Synopsis

__m128d _mm_mask_sqrt_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vsqrtpd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := SQRT(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vsqrtpd
__m128d _mm_maskz_sqrt_pd (__mmask8 k, __m128d a)

Synopsis

__m128d _mm_maskz_sqrt_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vsqrtpd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := SQRT(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
sqrtpd
__m128d _mm_sqrt_pd (__m128d a)

Synopsis

__m128d _mm_sqrt_pd (__m128d a)
#include "emmintrin.h"
Instruction: sqrtpd xmm, xmm
CPUID Flags: SSE2

Description

Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := SQRT(a[i+63:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell2013
Ivy Bridge2014
Sandy Bridge2122
Westmere3230
Nehalem3230
vsqrtpd
__m256d _mm256_mask_sqrt_pd (__m256d src, __mmask8 k, __m256d a)

Synopsis

__m256d _mm256_mask_sqrt_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vsqrtpd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := SQRT(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vsqrtpd
__m256d _mm256_maskz_sqrt_pd (__mmask8 k, __m256d a)

Synopsis

__m256d _mm256_maskz_sqrt_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vsqrtpd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := SQRT(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vsqrtpd
__m256d _mm256_sqrt_pd (__m256d a)

Synopsis

__m256d _mm256_sqrt_pd (__m256d a)
#include "immintrin.h"
Instruction: vsqrtpd ymm, ymm
CPUID Flags: AVX

Description

Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := SQRT(a[i+63:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell3528
Ivy Bridge3528
Sandy Bridge4344
vsqrtpd
__m512d _mm512_mask_sqrt_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_sqrt_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vsqrtpd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SQRT(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vsqrtpd
__m512d _mm512_maskz_sqrt_pd (__mmask8 k, __m512d a)

Synopsis

__m512d _mm512_maskz_sqrt_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vsqrtpd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SQRT(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vsqrtpd
__m512d _mm512_sqrt_pd (__m512d a)

Synopsis

__m512d _mm512_sqrt_pd (__m512d a)
#include "immintrin.h"
Instruction: vsqrtpd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := SQRT(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vsqrtps
__m128 _mm_mask_sqrt_ps (__m128 src, __mmask8 k, __m128 a)

Synopsis

__m128 _mm_mask_sqrt_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vsqrtps
CPUID Flags: AVX512VL + AVX512F

Description

Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := SQRT(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vsqrtps
__m128 _mm_maskz_sqrt_ps (__mmask8 k, __m128 a)

Synopsis

__m128 _mm_maskz_sqrt_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vsqrtps
CPUID Flags: AVX512VL + AVX512F

Description

Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := SQRT(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
sqrtps
__m128 _mm_sqrt_ps (__m128 a)

Synopsis

__m128 _mm_sqrt_ps (__m128 a)
#include "xmmintrin.h"
Instruction: sqrtps xmm, xmm
CPUID Flags: SSE

Description

Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := SQRT(a[i+31:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell187
Ivy Bridge207
Sandy Bridge2214
Westmere2516
Nehalem2516
vsqrtps
__m256 _mm256_mask_sqrt_ps (__m256 src, __mmask8 k, __m256 a)

Synopsis

__m256 _mm256_mask_sqrt_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vsqrtps
CPUID Flags: AVX512VL + AVX512F

Description

Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := SQRT(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vsqrtps
__m256 _mm256_maskz_sqrt_ps (__mmask8 k, __m256 a)

Synopsis

__m256 _mm256_maskz_sqrt_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vsqrtps
CPUID Flags: AVX512VL + AVX512F

Description

Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := SQRT(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vsqrtps
__m256 _mm256_sqrt_ps (__m256 a)

Synopsis

__m256 _mm256_sqrt_ps (__m256 a)
#include "immintrin.h"
Instruction: vsqrtps ymm, ymm
CPUID Flags: AVX

Description

Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := SQRT(a[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell2114
Ivy Bridge2114
Sandy Bridge2928
vsqrtps
__m512 _mm512_mask_sqrt_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_sqrt_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vsqrtps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SQRT(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vsqrtps
__m512 _mm512_maskz_sqrt_ps (__mmask16 k, __m512 a)

Synopsis

__m512 _mm512_maskz_sqrt_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vsqrtps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SQRT(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vsqrtps
__m512 _mm512_sqrt_ps (__m512 a)

Synopsis

__m512 _mm512_sqrt_ps (__m512 a)
#include "immintrin.h"
Instruction: vsqrtps zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := SQRT(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vsqrtpd
__m512d _mm512_mask_sqrt_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)

Synopsis

__m512d _mm512_mask_sqrt_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vsqrtpd zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SQRT(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vsqrtpd
__m512d _mm512_maskz_sqrt_round_pd (__mmask8 k, __m512d a, int rounding)

Synopsis

__m512d _mm512_maskz_sqrt_round_pd (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vsqrtpd zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SQRT(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vsqrtpd
__m512d _mm512_sqrt_round_pd (__m512d a, int rounding)

Synopsis

__m512d _mm512_sqrt_round_pd (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vsqrtpd zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := SQRT(a[i+63:i]) ENDFOR dst[MAX:512] := 0
vsqrtps
__m512 _mm512_mask_sqrt_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)

Synopsis

__m512 _mm512_mask_sqrt_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vsqrtps zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SQRT(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vsqrtps
__m512 _mm512_maskz_sqrt_round_ps (__mmask16 k, __m512 a, int rounding)

Synopsis

__m512 _mm512_maskz_sqrt_round_ps (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vsqrtps zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SQRT(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vsqrtps
__m512 _mm512_sqrt_round_ps (__m512 a, int rounding)

Synopsis

__m512 _mm512_sqrt_round_ps (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vsqrtps zmm {k}, zmm {er}
CPUID Flags: AVX512F

Description

Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := SQRT(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vsqrtsd
__m128d _mm_mask_sqrt_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_mask_sqrt_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vsqrtsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Compute the square root of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := SQRT(a[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vsqrtsd
__m128d _mm_maskz_sqrt_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_maskz_sqrt_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vsqrtsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Compute the square root of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := SQRT(a[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vsqrtsd
__m128d _mm_sqrt_round_sd (__m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_sqrt_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vsqrtsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Compute the square root of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from b to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := SQRT(a[63:0]) dst[127:64] := b[127:64] dst[MAX:128] := 0
vsqrtss
__m128 _mm_mask_sqrt_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_mask_sqrt_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vsqrtss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Compute the square root of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := SQRT(a[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vsqrtss
__m128 _mm_maskz_sqrt_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_maskz_sqrt_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vsqrtss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Compute the square root of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := SQRT(a[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vsqrtss
__m128 _mm_sqrt_round_ss (__m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_sqrt_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vsqrtss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Compute the square root of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from b to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := SQRT(a[31:0]) dst[127:32] := b[127:32] dst[MAX:128] := 0
vsqrtsd
__m128d _mm_mask_sqrt_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_sqrt_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vsqrtsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the square root of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.

Operation

IF k[0] dst[63:0] := SQRT(a[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := b[127:64] dst[MAX:128] := 0
vsqrtsd
__m128d _mm_maskz_sqrt_sd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_sqrt_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vsqrtsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the square root of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.

Operation

IF k[0] dst[63:0] := SQRT(a[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := b[127:64] dst[MAX:128] := 0
sqrtsd
__m128d _mm_sqrt_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_sqrt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: sqrtsd xmm, xmm
CPUID Flags: SSE2

Description

Compute the square root of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from b to the upper element of dst.

Operation

dst[63:0] := SQRT(a[63:0]) dst[127:64] := b[127:64]

Performance

Architecture | Latency | Throughput
Haswell | 20 | 13
Ivy Bridge | 20 | 14
Sandy Bridge | 21 | 22
Westmere | 32 | 30
Nehalem | 32 | 30
vsqrtss
__m128 _mm_mask_sqrt_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_sqrt_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vsqrtss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the square root of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.

Operation

IF k[0] dst[31:0] := SQRT(a[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := b[127:32] dst[MAX:128] := 0
vsqrtss
__m128 _mm_maskz_sqrt_ss (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_sqrt_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vsqrtss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Compute the square root of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.

Operation

IF k[0] dst[31:0] := SQRT(a[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := b[127:32] dst[MAX:128] := 0
sqrtss
__m128 _mm_sqrt_ss (__m128 a)

Synopsis

__m128 _mm_sqrt_ss (__m128 a)
#include "xmmintrin.h"
Instruction: sqrtss xmm, xmm
CPUID Flags: SSE

Description

Compute the square root of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := SQRT(a[31:0]) dst[127:32] := a[127:32]

Performance

Architecture | Latency | Throughput
Haswell | 18 | 7
Ivy Bridge | 20 | 7
Sandy Bridge | 22 | 14
Westmere | 25 | 16
Nehalem | 25 | 16
vpsraw
__m128i _mm_mask_sra_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_sra_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpsraw
__m128i _mm_maskz_sra_epi16 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_sra_epi16 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
psraw
__m128i _mm_sra_epi16 (__m128i a, __m128i count)

Synopsis

__m128i _mm_sra_epi16 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: psraw xmm, xmm
CPUID Flags: SSE2

Description

Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ENDFOR

Performance

Architecture | Latency | Throughput
Westmere | 2 | -
Nehalem | 2 | -
vpsraw
__m256i _mm256_mask_sra_epi16 (__m256i src, __mmask16 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_mask_sra_epi16 (__m256i src, __mmask16 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpsraw
__m256i _mm256_maskz_sra_epi16 (__mmask16 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_maskz_sra_epi16 (__mmask16 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsraw
__m256i _mm256_sra_epi16 (__m256i a, __m128i count)

Synopsis

__m256i _mm256_sra_epi16 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw ymm, ymm, xmm
CPUID Flags: AVX2

Description

Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 4 | -
vpsraw
__m512i _mm512_mask_sra_epi16 (__m512i src, __mmask32 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_mask_sra_epi16 (__m512i src, __mmask32 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpsraw
__m512i _mm512_maskz_sra_epi16 (__mmask32 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_maskz_sra_epi16 (__mmask32 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsraw
__m512i _mm512_sra_epi16 (__m512i a, __m128i count)

Synopsis

__m512i _mm512_sra_epi16 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ENDFOR dst[MAX:512] := 0
vpsrad
__m128i _mm_mask_sra_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_sra_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpsrad
__m128i _mm_maskz_sra_epi32 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_sra_epi32 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
psrad
__m128i _mm_sra_epi32 (__m128i a, __m128i count)

Synopsis

__m128i _mm_sra_epi32 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: psrad xmm, xmm
CPUID Flags: SSE2

Description

Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 IF count[63:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0]) FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 2 | 1
Ivy Bridge | 1 | 1
Ivy Bridge | 2 | 1
Sandy Bridge | 1 | 1
Sandy Bridge | 2 | 1
Westmere | 2 | 1
Nehalem | 2 | 1
vpsrad
__m256i _mm256_mask_sra_epi32 (__m256i src, __mmask8 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_mask_sra_epi32 (__m256i src, __mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpsrad
__m256i _mm256_maskz_sra_epi32 (__mmask8 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_maskz_sra_epi32 (__mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsrad
__m256i _mm256_sra_epi32 (__m256i a, __m128i count)

Synopsis

__m256i _mm256_sra_epi32 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad ymm, ymm, xmm
CPUID Flags: AVX2

Description

Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 IF count[63:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0]) FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 4 | -
vpsrad
__m512i _mm512_mask_sra_epi32 (__m512i src, __mmask16 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_mask_sra_epi32 (__m512i src, __mmask16 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpsrad
__m512i _mm512_maskz_sra_epi32 (__mmask16 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_maskz_sra_epi32 (__mmask16 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsrad
__m512i _mm512_sra_epi32 (__m512i a, __m128i count)

Synopsis

__m512i _mm512_sra_epi32 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 IF count[63:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0]) FI ENDFOR dst[MAX:512] := 0
vpsraq
__m128i _mm_mask_sra_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_sra_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpsraq
__m128i _mm_maskz_sra_epi64 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_sra_epi64 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpsraq
__m128i _mm_sra_epi64 (__m128i a, __m128i count)

Synopsis

__m128i _mm_sra_epi64 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 IF count[63:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0]) FI ENDFOR dst[MAX:128] := 0
vpsraq
__m256i _mm256_mask_sra_epi64 (__m256i src, __mmask8 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_mask_sra_epi64 (__m256i src, __mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpsraq
__m256i _mm256_maskz_sra_epi64 (__mmask8 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_maskz_sra_epi64 (__mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsraq
__m256i _mm256_sra_epi64 (__m256i a, __m128i count)

Synopsis

__m256i _mm256_sra_epi64 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 IF count[63:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0]) FI ENDFOR dst[MAX:256] := 0
vpsraq
__m512i _mm512_mask_sra_epi64 (__m512i src, __mmask8 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_mask_sra_epi64 (__m512i src, __mmask8 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpsraq
__m512i _mm512_maskz_sra_epi64 (__mmask8 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_maskz_sra_epi64 (__mmask8 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsraq
__m512i _mm512_sra_epi64 (__m512i a, __m128i count)

Synopsis

__m512i _mm512_sra_epi64 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 IF count[63:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0]) FI ENDFOR dst[MAX:512] := 0
vpsraw
__m128i _mm_mask_srai_epi16 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_mask_srai_epi16 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpsraw
__m128i _mm_maskz_srai_epi16 (__mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_maskz_srai_epi16 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
psraw
__m128i _mm_srai_epi16 (__m128i a, int imm8)

Synopsis

__m128i _mm_srai_epi16 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psraw xmm, imm
CPUID Flags: SSE2

Description

Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | 1
Ivy Bridge | 1 | 1
Sandy Bridge | 1 | 1
Westmere | 1 | 1
Nehalem | 1 | 1
vpsraw
__m256i _mm256_mask_srai_epi16 (__m256i src, __mmask16 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_mask_srai_epi16 (__m256i src, __mmask16 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpsraw
__m256i _mm256_maskz_srai_epi16 (__mmask16 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_maskz_srai_epi16 (__mmask16 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsraw
__m256i _mm256_srai_epi16 (__m256i a, int imm8)

Synopsis

__m256i _mm256_srai_epi16 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsraw ymm, ymm, imm
CPUID Flags: AVX2

Description

Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vpsraw
__m512i _mm512_mask_srai_epi16 (__m512i src, __mmask32 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_mask_srai_epi16 (__m512i src, __mmask32 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpsraw
__m512i _mm512_maskz_srai_epi16 (__mmask32 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_maskz_srai_epi16 (__mmask32 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsraw
__m512i _mm512_srai_epi16 (__m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_srai_epi16 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ENDFOR dst[MAX:512] := 0
vpsrad
__m128i _mm_mask_srai_epi32 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_mask_srai_epi32 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpsrad
__m128i _mm_maskz_srai_epi32 (__mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_maskz_srai_epi32 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
psrad
__m128i _mm_srai_epi32 (__m128i a, int imm8)

Synopsis

__m128i _mm_srai_epi32 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psrad xmm, imm
CPUID Flags: SSE2

Description

Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 IF imm8[7:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0]) FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge1-
Westmere1-
Nehalem1-
vpsrad
__m256i _mm256_mask_srai_epi32 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_mask_srai_epi32 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpsrad
__m256i _mm256_maskz_srai_epi32 (__mmask8 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_maskz_srai_epi32 (__mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsrad
__m256i _mm256_srai_epi32 (__m256i a, int imm8)

Synopsis

__m256i _mm256_srai_epi32 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsrad ymm, ymm, imm
CPUID Flags: AVX2

Description

Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 IF imm8[7:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0]) FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsrad
__m512i _mm512_mask_srai_epi32 (__m512i src, __mmask16 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_mask_srai_epi32 (__m512i src, __mmask16 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrad zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpsrad
__m512i _mm512_maskz_srai_epi32 (__mmask16 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_maskz_srai_epi32 (__mmask16 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrad zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsrad
__m512i _mm512_srai_epi32 (__m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_srai_epi32 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrad zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 IF imm8[7:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0]) FI ENDFOR dst[MAX:512] := 0
vpsraq
__m128i _mm_mask_srai_epi64 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_mask_srai_epi64 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpsraq
__m128i _mm_maskz_srai_epi64 (__mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_maskz_srai_epi64 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpsraq
__m128i _mm_srai_epi64 (__m128i a, unsigned int imm8)

Synopsis

__m128i _mm_srai_epi64 (__m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 IF imm8[7:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0]) FI ENDFOR dst[MAX:128] := 0
vpsraq
__m256i _mm256_mask_srai_epi64 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_mask_srai_epi64 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpsraq
__m256i _mm256_maskz_srai_epi64 (__mmask8 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_maskz_srai_epi64 (__mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsraq
__m256i _mm256_srai_epi64 (__m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_srai_epi64 (__m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 IF imm8[7:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0]) FI ENDFOR dst[MAX:256] := 0
vpsraq
__m512i _mm512_mask_srai_epi64 (__m512i src, __mmask8 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_mask_srai_epi64 (__m512i src, __mmask8 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpsraq
__m512i _mm512_maskz_srai_epi64 (__mmask8 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_maskz_srai_epi64 (__mmask8 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsraq
__m512i _mm512_srai_epi64 (__m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_srai_epi64 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 IF imm8[7:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0]) FI ENDFOR dst[MAX:512] := 0
vpsravw
__m128i _mm_mask_srav_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_srav_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpsravw
__m128i _mm_maskz_srav_epi16 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_srav_epi16 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpsravw
__m128i _mm_srav_epi16 (__m128i a, __m128i count)

Synopsis

__m128i _mm_srav_epi16 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ENDFOR dst[MAX:128] := 0
vpsravw
__m256i _mm256_mask_srav_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_mask_srav_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpsravw
__m256i _mm256_maskz_srav_epi16 (__mmask16 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_maskz_srav_epi16 (__mmask16 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsravw
__m256i _mm256_srav_epi16 (__m256i a, __m256i count)

Synopsis

__m256i _mm256_srav_epi16 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ENDFOR dst[MAX:256] := 0
vpsravw
__m512i _mm512_mask_srav_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_mask_srav_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpsravw
__m512i _mm512_maskz_srav_epi16 (__mmask32 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_maskz_srav_epi16 (__mmask32 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsravw
__m512i _mm512_srav_epi16 (__m512i a, __m512i count)

Synopsis

__m512i _mm512_srav_epi16 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ENDFOR dst[MAX:512] := 0
vpsravd
__m128i _mm_mask_srav_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_srav_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravd
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpsravd
__m128i _mm_maskz_srav_epi32 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_srav_epi32 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravd
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpsravd
__m128i _mm_srav_epi32 (__m128i a, __m128i count)

Synopsis

__m128i _mm_srav_epi32 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravd xmm, xmm, xmm
CPUID Flags: AVX2

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i]) ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell2-
vpsravd
__m256i _mm256_mask_srav_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_mask_srav_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravd
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpsravd
__m256i _mm256_maskz_srav_epi32 (__mmask8 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_maskz_srav_epi32 (__mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravd
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsravd
__m256i _mm256_srav_epi32 (__m256i a, __m256i count)

Synopsis

__m256i _mm256_srav_epi32 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell2-
vpsravd
__m512i _mm512_mask_srav_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_mask_srav_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpsravd
__m512i _mm512_maskz_srav_epi32 (__mmask16 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_maskz_srav_epi32 (__mmask16 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsravd
__m512i _mm512_srav_epi32 (__m512i a, __m512i count)

Synopsis

__m512i _mm512_srav_epi32 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i]) ENDFOR dst[MAX:512] := 0
vpsravq
__m128i _mm_mask_srav_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_srav_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpsravq
__m128i _mm_maskz_srav_epi64 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_srav_epi64 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpsravq
__m128i _mm_srav_epi64 (__m128i a, __m128i count)

Synopsis

__m128i _mm_srav_epi64 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i]) ENDFOR dst[MAX:128] := 0
vpsravq
__m256i _mm256_mask_srav_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_mask_srav_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpsravq
__m256i _mm256_maskz_srav_epi64 (__mmask8 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_maskz_srav_epi64 (__mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsravq
__m256i _mm256_srav_epi64 (__m256i a, __m256i count)

Synopsis

__m256i _mm256_srav_epi64 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i]) ENDFOR dst[MAX:256] := 0
vpsravq
__m512i _mm512_mask_srav_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_mask_srav_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpsravq
__m512i _mm512_maskz_srav_epi64 (__mmask8 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_maskz_srav_epi64 (__mmask8 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsravq
__m512i _mm512_srav_epi64 (__m512i a, __m512i count)

Synopsis

__m512i _mm512_srav_epi64 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i]) ENDFOR dst[MAX:512] := 0
vpsrlw
__m128i _mm_mask_srl_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_srl_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpsrlw
__m128i _mm_maskz_srl_epi16 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_srl_epi16 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
psrlw
__m128i _mm_srl_epi16 (__m128i a, __m128i count)

Synopsis

__m128i _mm_srl_epi16 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: psrlw xmm, xmm
CPUID Flags: SSE2

Description

Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell21
Ivy Bridge21
Sandy Bridge21
Westmere21
Nehalem21
vpsrlw
__m256i _mm256_mask_srl_epi16 (__m256i src, __mmask16 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_mask_srl_epi16 (__m256i src, __mmask16 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpsrlw
__m256i _mm256_maskz_srl_epi16 (__mmask16 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_maskz_srl_epi16 (__mmask16 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsrlw
__m256i _mm256_srl_epi16 (__m256i a, __m128i count)

Synopsis

__m256i _mm256_srl_epi16 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw ymm, ymm, xmm
CPUID Flags: AVX2

Description

Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell4-
vpsrlw
__m512i _mm512_mask_srl_epi16 (__m512i src, __mmask32 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_mask_srl_epi16 (__m512i src, __mmask32 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpsrlw
__m512i _mm512_maskz_srl_epi16 (__mmask32 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_maskz_srl_epi16 (__mmask32 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsrlw
__m512i _mm512_srl_epi16 (__m512i a, __m128i count)

Synopsis

__m512i _mm512_srl_epi16 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ENDFOR dst[MAX:512] := 0
vpsrld
__m128i _mm_mask_srl_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_srl_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpsrld
__m128i _mm_maskz_srl_epi32 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_srl_epi32 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
psrld
__m128i _mm_srl_epi32 (__m128i a, __m128i count)

Synopsis

__m128i _mm_srl_epi32 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: psrld xmm, xmm
CPUID Flags: SSE2

Description

Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0]) FI ENDFOR

Performance

ArchitectureLatencyThroughput
Westmere2-
Nehalem2-
vpsrld
__m256i _mm256_mask_srl_epi32 (__m256i src, __mmask8 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_mask_srl_epi32 (__m256i src, __mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpsrld
__m256i _mm256_maskz_srl_epi32 (__mmask8 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_maskz_srl_epi32 (__mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsrld
__m256i _mm256_srl_epi32 (__m256i a, __m128i count)

Synopsis

__m256i _mm256_srl_epi32 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld ymm, ymm, xmm
CPUID Flags: AVX2

Description

Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0]) FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell4-
vpsrld
__m512i _mm512_mask_srl_epi32 (__m512i src, __mmask16 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_mask_srl_epi32 (__m512i src, __mmask16 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpsrld
__m512i _mm512_maskz_srl_epi32 (__mmask16 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_maskz_srl_epi32 (__mmask16 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsrld
__m512i _mm512_srl_epi32 (__m512i a, __m128i count)

Synopsis

__m512i _mm512_srl_epi32 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0]) FI ENDFOR dst[MAX:512] := 0
vpsrlq
__m128i _mm_mask_srl_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_srl_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpsrlq
__m128i _mm_maskz_srl_epi64 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_srl_epi64 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
psrlq
__m128i _mm_srl_epi64 (__m128i a, __m128i count)

Synopsis

__m128i _mm_srl_epi64 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: psrlq xmm, xmm
CPUID Flags: SSE2

Description

Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0]) FI ENDFOR

Performance

ArchitectureLatencyThroughput
Westmere2-
Nehalem2-
vpsrlq
__m256i _mm256_mask_srl_epi64 (__m256i src, __mmask8 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_mask_srl_epi64 (__m256i src, __mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpsrlq
__m256i _mm256_maskz_srl_epi64 (__mmask8 k, __m256i a, __m128i count)

Synopsis

__m256i _mm256_maskz_srl_epi64 (__mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsrlq
__m256i _mm256_srl_epi64 (__m256i a, __m128i count)

Synopsis

__m256i _mm256_srl_epi64 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq ymm, ymm, xmm
CPUID Flags: AVX2

Description

Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0]) FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell4-
vpsrlq
__m512i _mm512_mask_srl_epi64 (__m512i src, __mmask8 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_mask_srl_epi64 (__m512i src, __mmask8 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpsrlq
__m512i _mm512_maskz_srl_epi64 (__mmask8 k, __m512i a, __m128i count)

Synopsis

__m512i _mm512_maskz_srl_epi64 (__mmask8 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsrlq
__m512i _mm512_srl_epi64 (__m512i a, __m128i count)

Synopsis

__m512i _mm512_srl_epi64 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq zmm {k}, zmm, xmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0]) FI ENDFOR dst[MAX:512] := 0
vpsrlw
__m128i _mm_mask_srli_epi16 (__m128i src, __mmask8 k, __m128i a, int imm8)

Synopsis

__m128i _mm_mask_srli_epi16 (__m128i src, __mmask8 k, __m128i a, int imm8)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpsrlw
__m128i _mm_maskz_srli_epi16 (__mmask8 k, __m128i a, int imm8)

Synopsis

__m128i _mm_maskz_srli_epi16 (__mmask8 k, __m128i a, int imm8)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
psrlw
__m128i _mm_srli_epi16 (__m128i a, int imm8)

Synopsis

__m128i _mm_srli_epi16 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psrlw xmm, imm
CPUID Flags: SSE2

Description

Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge1-
Westmere1-
Nehalem1-
vpsrlw
__m256i _mm256_mask_srli_epi16 (__m256i src, __mmask16 k, __m256i a, int imm8)

Synopsis

__m256i _mm256_mask_srli_epi16 (__m256i src, __mmask16 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpsrlw
__m256i _mm256_maskz_srli_epi16 (__mmask16 k, __m256i a, int imm8)

Synopsis

__m256i _mm256_maskz_srli_epi16 (__mmask16 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsrlw
__m256i _mm256_srli_epi16 (__m256i a, int imm8)

Synopsis

__m256i _mm256_srli_epi16 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsrlw ymm, ymm, imm
CPUID Flags: AVX2

Description

Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsrlw
__m512i _mm512_mask_srli_epi16 (__m512i src, __mmask32 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_mask_srli_epi16 (__m512i src, __mmask32 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpsrlw
__m512i _mm512_maskz_srli_epi16 (__mmask32 k, __m512i a, int imm8)

Synopsis

__m512i _mm512_maskz_srli_epi16 (__mmask32 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsrlw
__m512i _mm512_srli_epi16 (__m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_srli_epi16 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ENDFOR dst[MAX:512] := 0
vpsrld
__m128i _mm_mask_srli_epi32 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_mask_srli_epi32 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpsrld
__m128i _mm_maskz_srli_epi32 (__mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_maskz_srli_epi32 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
psrld
__m128i _mm_srli_epi32 (__m128i a, int imm8)

Synopsis

__m128i _mm_srli_epi32 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psrld xmm, imm
CPUID Flags: SSE2

Description

Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0]) FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
Westmere11
Nehalem11
vpsrld
__m256i _mm256_mask_srli_epi32 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_mask_srli_epi32 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpsrld
__m256i _mm256_maskz_srli_epi32 (__mmask8 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_maskz_srli_epi32 (__mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsrld
__m256i _mm256_srli_epi32 (__m256i a, int imm8)

Synopsis

__m256i _mm256_srli_epi32 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsrld ymm, ymm, imm
CPUID Flags: AVX2

Description

Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0]) FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsrld
__m512i _mm512_mask_srli_epi32 (__m512i src, __mmask16 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_mask_srli_epi32 (__m512i src, __mmask16 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrld zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpsrld
__m512i _mm512_maskz_srli_epi32 (__mmask16 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_maskz_srli_epi32 (__mmask16 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrld zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsrld
__m512i _mm512_srli_epi32 (__m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_srli_epi32 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrld zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0]) FI ENDFOR dst[MAX:512] := 0
vpsrlq
__m128i _mm_mask_srli_epi64 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_mask_srli_epi64 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpsrlq
__m128i _mm_maskz_srli_epi64 (__mmask8 k, __m128i a, unsigned int imm8)

Synopsis

__m128i _mm_maskz_srli_epi64 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
psrlq
__m128i _mm_srli_epi64 (__m128i a, int imm8)

Synopsis

__m128i _mm_srli_epi64 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psrlq xmm, imm
CPUID Flags: SSE2

Description

Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0]) FI ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
Westmere11
Nehalem11
vpsrlq
__m256i _mm256_mask_srli_epi64 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_mask_srli_epi64 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpsrlq
__m256i _mm256_maskz_srli_epi64 (__mmask8 k, __m256i a, unsigned int imm8)

Synopsis

__m256i _mm256_maskz_srli_epi64 (__mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsrlq
__m256i _mm256_srli_epi64 (__m256i a, int imm8)

Synopsis

__m256i _mm256_srli_epi64 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsrlq ymm, ymm, imm
CPUID Flags: AVX2

Description

Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0]) FI ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsrlq
__m512i _mm512_mask_srli_epi64 (__m512i src, __mmask8 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_mask_srli_epi64 (__m512i src, __mmask8 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpsrlq
__m512i _mm512_maskz_srli_epi64 (__mmask8 k, __m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_maskz_srli_epi64 (__mmask8 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsrlq
__m512i _mm512_srli_epi64 (__m512i a, unsigned int imm8)

Synopsis

__m512i _mm512_srli_epi64 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlq zmm {k}, zmm, imm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0]) FI ENDFOR dst[MAX:512] := 0
psrldq
__m128i _mm_srli_si128 (__m128i a, int imm8)

Synopsis

__m128i _mm_srli_si128 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psrldq xmm, imm
CPUID Flags: SSE2

Description

Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.

Operation

tmp := imm8[7:0] IF tmp > 15 tmp := 16 FI dst[127:0] := a[127:0] >> (tmp*8)

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpsrldq
__m256i _mm256_srli_si256 (__m256i a, const int imm8)

Synopsis

__m256i _mm256_srli_si256 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpsrldq ymm, ymm, imm
CPUID Flags: AVX2

Description

Shift 128-bit lanes in a right by imm8 bytes while shifting in zeros, and store the results in dst.

Operation

tmp := imm8[7:0] IF tmp > 15 tmp := 16 FI dst[127:0] := a[127:0] >> (tmp*8) dst[255:128] := a[255:128] >> (tmp*8) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsrlvw
__m128i _mm_mask_srlv_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_srlv_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpsrlvw
__m128i _mm_maskz_srlv_epi16 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_srlv_epi16 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpsrlvw
__m128i _mm_srlv_epi16 (__m128i a, __m128i count)

Synopsis

__m128i _mm_srlv_epi16 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ENDFOR dst[MAX:128] := 0
vpsrlvw
__m256i _mm256_mask_srlv_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_mask_srlv_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpsrlvw
__m256i _mm256_maskz_srlv_epi16 (__mmask16 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_maskz_srlv_epi16 (__mmask16 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsrlvw
__m256i _mm256_srlv_epi16 (__m256i a, __m256i count)

Synopsis

__m256i _mm256_srlv_epi16 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512VL + AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ENDFOR dst[MAX:256] := 0
vpsrlvw
__m512i _mm512_mask_srlv_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_mask_srlv_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpsrlvw
__m512i _mm512_maskz_srlv_epi16 (__mmask32 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_maskz_srlv_epi16 (__mmask32 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsrlvw
__m512i _mm512_srlv_epi16 (__m512i a, __m512i count)

Synopsis

__m512i _mm512_srlv_epi16 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512BW

Description

Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ENDFOR dst[MAX:512] := 0
vpsrlvd
__m128i _mm_mask_srlv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_srlv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvd
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpsrlvd
__m128i _mm_maskz_srlv_epi32 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_srlv_epi32 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvd
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpsrlvd
__m128i _mm_srlv_epi32 (__m128i a, __m128i count)

Synopsis

__m128i _mm_srlv_epi32 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvd xmm, xmm, xmm
CPUID Flags: AVX2

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i]) ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell2-
vpsrlvd
__m256i _mm256_mask_srlv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_mask_srlv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvd
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpsrlvd
__m256i _mm256_maskz_srlv_epi32 (__mmask8 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_maskz_srlv_epi32 (__mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvd
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsrlvd
__m256i _mm256_srlv_epi32 (__m256i a, __m256i count)

Synopsis

__m256i _mm256_srlv_epi32 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell2-
vpsrlvd
__m512i _mm512_mask_srlv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_mask_srlv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpsrlvd
__m512i _mm512_maskz_srlv_epi32 (__mmask16 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_maskz_srlv_epi32 (__mmask16 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsrlvd
__m512i _mm512_srlv_epi32 (__m512i a, __m512i count)

Synopsis

__m512i _mm512_srlv_epi32 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i]) ENDFOR dst[MAX:512] := 0
vpsrlvq
__m128i _mm_mask_srlv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_mask_srlv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpsrlvq
__m128i _mm_maskz_srlv_epi64 (__mmask8 k, __m128i a, __m128i count)

Synopsis

__m128i _mm_maskz_srlv_epi64 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpsrlvq
__m128i _mm_srlv_epi64 (__m128i a, __m128i count)

Synopsis

__m128i _mm_srlv_epi64 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvq xmm, xmm, xmm
CPUID Flags: AVX2

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i]) ENDFOR dst[MAX:128] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsrlvq
__m256i _mm256_mask_srlv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_mask_srlv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpsrlvq
__m256i _mm256_maskz_srlv_epi64 (__mmask8 k, __m256i a, __m256i count)

Synopsis

__m256i _mm256_maskz_srlv_epi64 (__mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvq
CPUID Flags: AVX512VL + AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsrlvq
__m256i _mm256_srlv_epi64 (__m256i a, __m256i count)

Synopsis

__m256i _mm256_srlv_epi64 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsrlvq
__m512i _mm512_mask_srlv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_mask_srlv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpsrlvq
__m512i _mm512_maskz_srlv_epi64 (__mmask8 k, __m512i a, __m512i count)

Synopsis

__m512i _mm512_maskz_srlv_epi64 (__mmask8 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsrlvq
__m512i _mm512_srlv_epi64 (__m512i a, __m512i count)

Synopsis

__m512i _mm512_srlv_epi64 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i]) ENDFOR dst[MAX:512] := 0
vmovdqa32
void _mm_mask_store_epi32 (void* mem_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_store_epi32 (void* mem_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F

Description

Store packed 32-bit integers from a into memory using writemask k. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR
vmovdqa32
void _mm256_mask_store_epi32 (void* mem_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_store_epi32 (void* mem_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F

Description

Store packed 32-bit integers from a into memory using writemask k. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR
vmovdqa32
void _mm512_mask_store_epi32 (void* mem_addr, __mmask16 k, __m512i a)

Synopsis

void _mm512_mask_store_epi32 (void* mem_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa32 m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Store packed 32-bit integers from a into memory using writemask k. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR
vmovdqa32
void _mm512_store_epi32 (void* mem_addr, __m512i a)

Synopsis

void _mm512_store_epi32 (void* mem_addr, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa32 m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Store 512-bits (composed of 16 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+511:mem_addr] := a[511:0]
vmovdqa64
void _mm_mask_store_epi64 (void* mem_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_store_epi64 (void* mem_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F

Description

Store packed 64-bit integers from a into memory using writemask k. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR
vmovdqa64
void _mm256_mask_store_epi64 (void* mem_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_store_epi64 (void* mem_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F

Description

Store packed 64-bit integers from a into memory using writemask k. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR
vmovdqa64
void _mm512_mask_store_epi64 (void* mem_addr, __mmask8 k, __m512i a)

Synopsis

void _mm512_mask_store_epi64 (void* mem_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa64 m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Store packed 64-bit integers from a into memory using writemask k. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR
vmovdqa64
void _mm512_store_epi64 (void* mem_addr, __m512i a)

Synopsis

void _mm512_store_epi64 (void* mem_addr, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa64 m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Store 512-bits (composed of 8 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+511:mem_addr] := a[511:0]
vmovapd
void _mm_mask_store_pd (void* mem_addr, __mmask8 k, __m128d a)

Synopsis

void _mm_mask_store_pd (void* mem_addr, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F

Description

Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR
movapd
void _mm_store_pd (double* mem_addr, __m128d a)

Synopsis

void _mm_store_pd (double* mem_addr, __m128d a)
#include "emmintrin.h"
Instruction: movapd m128, xmm
CPUID Flags: SSE2

Description

Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+127:mem_addr] := a[127:0]
vmovapd
void _mm256_mask_store_pd (void* mem_addr, __mmask8 k, __m256d a)

Synopsis

void _mm256_mask_store_pd (void* mem_addr, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F

Description

Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR
vmovapd
void _mm256_store_pd (double * mem_addr, __m256d a)

Synopsis

void _mm256_store_pd (double * mem_addr, __m256d a)
#include "immintrin.h"
Instruction: vmovapd m256, ymm
CPUID Flags: AVX

Description

Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+255:mem_addr] := a[255:0]
vmovapd
void _mm512_mask_store_pd (void* mem_addr, __mmask8 k, __m512d a)

Synopsis

void _mm512_mask_store_pd (void* mem_addr, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vmovapd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR
vmovapd
void _mm512_store_pd (void* mem_addr, __m512d a)

Synopsis

void _mm512_store_pd (void* mem_addr, __m512d a)
#include "immintrin.h"
Instruction: vmovapd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+511:mem_addr] := a[511:0]
...
void _mm_store_pd1 (double* mem_addr, __m128d a)

Synopsis

void _mm_store_pd1 (double* mem_addr, __m128d a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Store the lower double-precision (64-bit) floating-point element from a into 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+63:mem_addr] := a[63:0] MEM[mem_addr+127:mem_addr+64] := a[63:0]
vmovaps
void _mm_mask_store_ps (void* mem_addr, __mmask8 k, __m128 a)

Synopsis

void _mm_mask_store_ps (void* mem_addr, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F

Description

Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR
movaps
void _mm_store_ps (float* mem_addr, __m128 a)

Synopsis

void _mm_store_ps (float* mem_addr, __m128 a)
#include "xmmintrin.h"
Instruction: movaps m128, xmm
CPUID Flags: SSE

Description

Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+127:mem_addr] := a[127:0]
vmovaps
void _mm256_mask_store_ps (void* mem_addr, __mmask8 k, __m256 a)

Synopsis

void _mm256_mask_store_ps (void* mem_addr, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F

Description

Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR
vmovaps
void _mm256_store_ps (float * mem_addr, __m256 a)

Synopsis

void _mm256_store_ps (float * mem_addr, __m256 a)
#include "immintrin.h"
Instruction: vmovaps m256, ymm
CPUID Flags: AVX

Description

Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+255:mem_addr] := a[255:0]
vmovaps
void _mm512_mask_store_ps (void* mem_addr, __mmask16 k, __m512 a)

Synopsis

void _mm512_mask_store_ps (void* mem_addr, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovaps m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR
vmovaps
void _mm512_store_ps (void* mem_addr, __m512 a)

Synopsis

void _mm512_store_ps (void* mem_addr, __m512 a)
#include "immintrin.h"
Instruction: vmovaps m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+511:mem_addr] := a[511:0]
...
void _mm_store_ps1 (float* mem_addr, __m128 a)

Synopsis

void _mm_store_ps1 (float* mem_addr, __m128 a)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Store the lower single-precision (32-bit) floating-point element from a into 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+31:mem_addr] := a[31:0] MEM[mem_addr+63:mem_addr+32] := a[31:0] MEM[mem_addr+95:mem_addr+64] := a[31:0] MEM[mem_addr+127:mem_addr+96] := a[31:0]
vmovsd
void _mm_mask_store_sd (double* mem_addr, __mmask8 k, __m128d a)

Synopsis

void _mm_mask_store_sd (double* mem_addr, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vmovsd m64 {k}, xmm
CPUID Flags: AVX512F

Description

Store the lower double-precision (64-bit) floating-point element from a into memory using writemask k. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

IF k[0] MEM[mem_addr+63:mem_addr] := a[63:0] FI
movsd
void _mm_store_sd (double* mem_addr, __m128d a)

Synopsis

void _mm_store_sd (double* mem_addr, __m128d a)
#include "emmintrin.h"
Instruction: movsd m64, xmm
CPUID Flags: SSE2

Description

Store the lower double-precision (64-bit) floating-point element from a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+63:mem_addr] := a[63:0]
movdqa
void _mm_store_si128 (__m128i* mem_addr, __m128i a)

Synopsis

void _mm_store_si128 (__m128i* mem_addr, __m128i a)
#include "emmintrin.h"
Instruction: movdqa m128, xmm
CPUID Flags: SSE2

Description

Store 128-bits of integer data from a into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+127:mem_addr] := a[127:0]
vmovdqa
void _mm256_store_si256 (__m256i * mem_addr, __m256i a)

Synopsis

void _mm256_store_si256 (__m256i * mem_addr, __m256i a)
#include "immintrin.h"
Instruction: vmovdqa m256, ymm
CPUID Flags: AVX

Description

Store 256-bits of integer data from a into memory. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+255:mem_addr] := a[255:0]
vmovdqa32
void _mm512_store_si512 (void* mem_addr, __m512i a)

Synopsis

void _mm512_store_si512 (void* mem_addr, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa32 m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Store 512-bits of integer data from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+511:mem_addr] := a[511:0]
vmovss
void _mm_mask_store_ss (float* mem_addr, __mmask8 k, __m128 a)

Synopsis

void _mm_mask_store_ss (float* mem_addr, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovss m32 {k}, xmm
CPUID Flags: AVX512F

Description

Store the lower single-precision (32-bit) floating-point element from a into memory using writemask k. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

IF k[0] MEM[mem_addr+31:mem_addr] := a[31:0] FI
movss
void _mm_store_ss (float* mem_addr, __m128 a)

Synopsis

void _mm_store_ss (float* mem_addr, __m128 a)
#include "xmmintrin.h"
Instruction: movss m32, xmm
CPUID Flags: SSE

Description

Store the lower single-precision (32-bit) floating-point element from a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+31:mem_addr] := a[31:0]
...
void _mm_store1_pd (double* mem_addr, __m128d a)

Synopsis

void _mm_store1_pd (double* mem_addr, __m128d a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Store the lower double-precision (64-bit) floating-point element from a into 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+63:mem_addr] := a[63:0] MEM[mem_addr+127:mem_addr+64] := a[63:0]
...
void _mm_store1_ps (float* mem_addr, __m128 a)

Synopsis

void _mm_store1_ps (float* mem_addr, __m128 a)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Store the lower single-precision (32-bit) floating-point element from a into 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+31:mem_addr] := a[31:0] MEM[mem_addr+63:mem_addr+32] := a[31:0] MEM[mem_addr+95:mem_addr+64] := a[31:0] MEM[mem_addr+127:mem_addr+96] := a[31:0]
...
void _storebe_i16 (void * ptr, short data)

Synopsis

void _storebe_i16 (void * ptr, short data)
#include "immintrin.h"

Description

Stores word-sized (16-bit) data to address ptr in big-endian format.

Operation

addr := MEM[ptr] FOR j := 0 to 1 i := j*8 addr[i+7:i] := data[15-i:15-i-7] ENDFOR
...
void _storebe_i32 (void * ptr, int data)

Synopsis

void _storebe_i32 (void * ptr, int data)
#include "immintrin.h"

Description

Stores double word-sized (32-bit) data to address ptr in big-endian format.

Operation

addr := MEM[ptr] FOR j := 0 to 3 i := j*8 addr[i+7:i] := data[31-i:31-i-7] ENDFOR
...
void _storebe_i64 (void * ptr, __int64 data)

Synopsis

void _storebe_i64 (void * ptr, __int64 data)
#include "immintrin.h"

Description

Stores quad word-sized (64-bit) data to address ptr in big-endian format.

Operation

addr := MEM[ptr] FOR j := 0 to 7 i := j*8 addr[i+7:i] := data[63-i:63-i-7] ENDFOR
movhpd
void _mm_storeh_pd (double* mem_addr, __m128d a)

Synopsis

void _mm_storeh_pd (double* mem_addr, __m128d a)
#include "emmintrin.h"
Instruction: movhpd m64, xmm
CPUID Flags: SSE2

Description

Store the upper double-precision (64-bit) floating-point element from a into memory.

Operation

MEM[mem_addr+63:mem_addr] := a[127:64]
movhps
void _mm_storeh_pi (__m64* mem_addr, __m128 a)

Synopsis

void _mm_storeh_pi (__m64* mem_addr, __m128 a)
#include "xmmintrin.h"
Instruction: movhps m64, xmm
CPUID Flags: SSE

Description

Store the upper 2 single-precision (32-bit) floating-point elements from a into memory.

Operation

MEM[mem_addr+31:mem_addr] := a[95:64] MEM[mem_addr+63:mem_addr+32] := a[127:96]
movq
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a)

Synopsis

void _mm_storel_epi64 (__m128i* mem_addr, __m128i a)
#include "emmintrin.h"
Instruction: movq m64, xmm
CPUID Flags: SSE2

Description

Store 64-bit integer from the first element of a into memory.

Operation

MEM[mem_addr+63:mem_addr] := a[63:0]
movlpd
void _mm_storel_pd (double* mem_addr, __m128d a)

Synopsis

void _mm_storel_pd (double* mem_addr, __m128d a)
#include "emmintrin.h"
Instruction: movlpd m64, xmm
CPUID Flags: SSE2

Description

Store the lower double-precision (64-bit) floating-point element from a into memory.

Operation

MEM[mem_addr+63:mem_addr] := a[63:0]
movlps
void _mm_storel_pi (__m64* mem_addr, __m128 a)

Synopsis

void _mm_storel_pi (__m64* mem_addr, __m128 a)
#include "xmmintrin.h"
Instruction: movlps m64, xmm
CPUID Flags: SSE

Description

Store the lower 2 single-precision (32-bit) floating-point elements from a into memory.

Operation

MEM[mem_addr+31:mem_addr] := a[31:0] MEM[mem_addr+63:mem_addr+32] := a[63:32]
vmovnrapd
void _mm512_storenr_pd (void * mt, __m512d v)

Synopsis

void _mm512_storenr_pd (void * mt, __m512d v)
#include "immintrin.h"
Instruction: vmovnrapd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed double-precision (64-bit) floating-point elements from v to memory address mt with a no-read hint to the processor.

Operation

addr := MEM[mt] FOR j := 0 to 7 i := j*64 addr[i+63:i] := v[i+63:i] ENDFOR
vmovnraps
void _mm512_storenr_ps (void * mt, __m512 v)

Synopsis

void _mm512_storenr_ps (void * mt, __m512 v)
#include "immintrin.h"
Instruction: vmovnraps m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed single-precision (32-bit) floating-point elements from v to memory address mt with a no-read hint to the processor.

Operation

addr := MEM[mt] FOR j := 0 to 15 i := j*32 addr[i+31:i] := v[i+31:i] ENDFOR
vmovnrngoapd
void _mm512_storenrngo_pd (void * mt, __m512d v)

Synopsis

void _mm512_storenrngo_pd (void * mt, __m512d v)
#include "immintrin.h"
Instruction: vmovnrngoapd m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed double-precision (64-bit) floating-point elements from v to memory address mt with a no-read hint and using a weakly-ordered memory consistency model (stores performed with this function are not globally ordered, and subsequent stores from the same thread can be observed before them).

Operation

addr := MEM[mt] FOR j := 0 to 7 i := j*64 addr[i+63:i] := v[i+63:i] ENDFOR
vmovnrngoaps
void _mm512_storenrngo_ps (void * mt, __m512 v)

Synopsis

void _mm512_storenrngo_ps (void * mt, __m512 v)
#include "immintrin.h"
Instruction: vmovnrngoaps m512 {k}, zmm
CPUID Flags: KNCNI

Description

Stores packed single-precision (32-bit) floating-point elements from v to memory address mt with a no-read hint and using a weakly-ordered memory consistency model (stores performed with this function are not globally ordered, and subsequent stores from the same thread can be observed before them).

Operation

addr := MEM[mt] FOR j := 0 to 15 i := j*32 addr[i+31:i] := v[i+31:i] ENDFOR
...
void _mm_storer_pd (double* mem_addr, __m128d a)

Synopsis

void _mm_storer_pd (double* mem_addr, __m128d a)
#include "emmintrin.h"
CPUID Flags: SSE2

Description

Store 2 double-precision (64-bit) floating-point elements from a into memory in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+63:mem_addr] := a[127:64] MEM[mem_addr+127:mem_addr+64] := a[63:0]
...
void _mm_storer_ps (float* mem_addr, __m128 a)

Synopsis

void _mm_storer_ps (float* mem_addr, __m128 a)
#include "xmmintrin.h"
Instruction: movups m128, xmm
CPUID Flags: SSE

Description

Store 4 single-precision (32-bit) floating-point elements from a into memory in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+31:mem_addr] := a[127:96] MEM[mem_addr+63:mem_addr+32] := a[95:64] MEM[mem_addr+95:mem_addr+64] := a[63:32] MEM[mem_addr+127:mem_addr+96] := a[31:0]
vmovdqu16
void _mm_mask_storeu_epi16 (void* mem_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_storeu_epi16 (void* mem_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW

Description

Store packed 16-bit integers from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 7 i := j*16 IF k[j] MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] FI ENDFOR
vmovdqu16
void _mm256_mask_storeu_epi16 (void* mem_addr, __mmask16 k, __m256i a)

Synopsis

void _mm256_mask_storeu_epi16 (void* mem_addr, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW

Description

Store packed 16-bit integers from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 15 i := j*16 IF k[j] MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] FI ENDFOR
vmovdqu16
void _mm512_mask_storeu_epi16 (void* mem_addr, __mmask32 k, __m512i a)

Synopsis

void _mm512_mask_storeu_epi16 (void* mem_addr, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512BW

Description

Store packed 16-bit integers from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 31 i := j*16 IF k[j] MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] FI ENDFOR
vmovdqu32
void _mm_mask_storeu_epi32 (void* mem_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_storeu_epi32 (void* mem_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu32
CPUID Flags: AVX512VL + AVX512F

Description

Store packed 32-bit integers from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR
vmovdqu32
void _mm256_mask_storeu_epi32 (void* mem_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_storeu_epi32 (void* mem_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu32
CPUID Flags: AVX512VL + AVX512F

Description

Store packed 32-bit integers from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR
vmovdqu32
void _mm512_mask_storeu_epi32 (void* mem_addr, __mmask16 k, __m512i a)

Synopsis

void _mm512_mask_storeu_epi32 (void* mem_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu32 m512 {k}, zmm
CPUID Flags: AVX512F

Description

Store packed 32-bit integers from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR
vmovdqu64
void _mm_mask_storeu_epi64 (void* mem_addr, __mmask8 k, __m128i a)

Synopsis

void _mm_mask_storeu_epi64 (void* mem_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu64
CPUID Flags: AVX512VL + AVX512F

Description

Store packed 64-bit integers from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR
vmovdqu64
void _mm256_mask_storeu_epi64 (void* mem_addr, __mmask8 k, __m256i a)

Synopsis

void _mm256_mask_storeu_epi64 (void* mem_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu64
CPUID Flags: AVX512VL + AVX512F

Description

Store packed 64-bit integers from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR
vmovdqu64
void _mm512_mask_storeu_epi64 (void* mem_addr, __mmask8 k, __m512i a)

Synopsis

void _mm512_mask_storeu_epi64 (void* mem_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu64 m512 {k}, zmm
CPUID Flags: AVX512F

Description

Store packed 64-bit integers from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR
vmovdqu8
void _mm_mask_storeu_epi8 (void* mem_addr, __mmask16 k, __m128i a)

Synopsis

void _mm_mask_storeu_epi8 (void* mem_addr, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW

Description

Store packed 8-bit integers from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 15 i := j*8 IF k[j] MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] FI ENDFOR
vmovdqu8
void _mm256_mask_storeu_epi8 (void* mem_addr, __mmask32 k, __m256i a)

Synopsis

void _mm256_mask_storeu_epi8 (void* mem_addr, __mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW

Description

Store packed 8-bit integers from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 31 i := j*8 IF k[j] MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] FI ENDFOR
vmovdqu8
void _mm512_mask_storeu_epi8 (void* mem_addr, __mmask64 k, __m512i a)

Synopsis

void _mm512_mask_storeu_epi8 (void* mem_addr, __mmask64 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512BW

Description

Store packed 8-bit integers from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 63 i := j*8 IF k[j] MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] FI ENDFOR
vmovupd
void _mm_mask_storeu_pd (void* mem_addr, __mmask8 k, __m128d a)

Synopsis

void _mm_mask_storeu_pd (void* mem_addr, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vmovupd
CPUID Flags: AVX512VL + AVX512F

Description

Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 1 i := j*64 IF k[j] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR
movupd
void _mm_storeu_pd (double* mem_addr, __m128d a)

Synopsis

void _mm_storeu_pd (double* mem_addr, __m128d a)
#include "emmintrin.h"
Instruction: movupd m128, xmm
CPUID Flags: SSE2

Description

Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+127:mem_addr] := a[127:0]
vmovupd
void _mm256_mask_storeu_pd (void* mem_addr, __mmask8 k, __m256d a)

Synopsis

void _mm256_mask_storeu_pd (void* mem_addr, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vmovupd
CPUID Flags: AVX512VL + AVX512F

Description

Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 3 i := j*64 IF k[j] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR
vmovupd
void _mm256_storeu_pd (double * mem_addr, __m256d a)

Synopsis

void _mm256_storeu_pd (double * mem_addr, __m256d a)
#include "immintrin.h"
Instruction: vmovupd m256, ymm
CPUID Flags: AVX

Description

Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+255:mem_addr] := a[255:0]
vmovupd
void _mm512_mask_storeu_pd (void* mem_addr, __mmask8 k, __m512d a)

Synopsis

void _mm512_mask_storeu_pd (void* mem_addr, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vmovupd m512 {k}, zmm
CPUID Flags: AVX512F

Description

Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 7 i := j*64 IF k[j] MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR
vmovupd
void _mm512_storeu_pd (void* mem_addr, __m512d a)

Synopsis

void _mm512_storeu_pd (void* mem_addr, __m512d a)
#include "immintrin.h"
Instruction: vmovupd m512 {k}, zmm
CPUID Flags: AVX512F

Description

Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+511:mem_addr] := a[511:0]
vmovups
void _mm_mask_storeu_ps (void* mem_addr, __mmask8 k, __m128 a)

Synopsis

void _mm_mask_storeu_ps (void* mem_addr, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovups
CPUID Flags: AVX512VL + AVX512F

Description

Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 3 i := j*32 IF k[j] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR
movups
void _mm_storeu_ps (float* mem_addr, __m128 a)

Synopsis

void _mm_storeu_ps (float* mem_addr, __m128 a)
#include "xmmintrin.h"
Instruction: movups m128, xmm
CPUID Flags: SSE

Description

Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+127:mem_addr] := a[127:0]
vmovups
void _mm256_mask_storeu_ps (void* mem_addr, __mmask8 k, __m256 a)

Synopsis

void _mm256_mask_storeu_ps (void* mem_addr, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovups
CPUID Flags: AVX512VL + AVX512F

Description

Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 7 i := j*32 IF k[j] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR
vmovups
void _mm256_storeu_ps (float * mem_addr, __m256 a)

Synopsis

void _mm256_storeu_ps (float * mem_addr, __m256 a)
#include "immintrin.h"
Instruction: vmovups m256, ymm
CPUID Flags: AVX

Description

Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+255:mem_addr] := a[255:0]
vmovups
void _mm512_mask_storeu_ps (void* mem_addr, __mmask16 k, __m512 a)

Synopsis

void _mm512_mask_storeu_ps (void* mem_addr, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovups m512 {k}, zmm
CPUID Flags: AVX512F

Description

Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. mem_addr does not need to be aligned on any particular boundary.

Operation

FOR j := 0 to 15 i := j*32 IF k[j] MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR
vmovups
void _mm512_storeu_ps (void* mem_addr, __m512 a)

Synopsis

void _mm512_storeu_ps (void* mem_addr, __m512 a)
#include "immintrin.h"
Instruction: vmovups m512 {k}, zmm
CPUID Flags: AVX512F

Description

Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+511:mem_addr] := a[511:0]
movdqu
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a)

Synopsis

void _mm_storeu_si128 (__m128i* mem_addr, __m128i a)
#include "emmintrin.h"
Instruction: movdqu m128, xmm
CPUID Flags: SSE2

Description

Store 128-bits of integer data from a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+127:mem_addr] := a[127:0]
...
void _mm_storeu_si16 (void* mem_addr, __m128i a)

Synopsis

void _mm_storeu_si16 (void* mem_addr, __m128i a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Store 16-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+15:mem_addr] := a[15:0]
movd+movw
void _mm_storeu_si16 (void* mem_addr, __m128i a)

Synopsis

void _mm_storeu_si16 (void* mem_addr, __m128i a)
#include "immintrin.h"
Instruction: movd+movw

Description

Store 16-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+15:mem_addr] := a[15:0]
vmovdqu
void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a)

Synopsis

void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu m256, ymm
CPUID Flags: AVX

Description

Store 256-bits of integer data from a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+255:mem_addr] := a[255:0]
movd
void _mm_storeu_si32 (void* mem_addr, __m128i a)

Synopsis

void _mm_storeu_si32 (void* mem_addr, __m128i a)
#include "immintrin.h"
Instruction: movd

Description

Store 32-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+31:mem_addr] := a[31:0]
movd
void _mm_storeu_si32 (void* mem_addr, __m128i a)

Synopsis

void _mm_storeu_si32 (void* mem_addr, __m128i a)
#include "immintrin.h"
Instruction: movd m32, xmm
CPUID Flags: SSE

Description

Store 32-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+31:mem_addr] := a[31:0]
vmovdqu32
void _mm512_storeu_si512 (void* mem_addr, __m512i a)

Synopsis

void _mm512_storeu_si512 (void* mem_addr, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu32 m512 {k}, zmm
CPUID Flags: AVX512F

Description

Store 512-bits of integer data from a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+511:mem_addr] := a[511:0]
movq
void _mm_storeu_si64 (void* mem_addr, __m128i a)

Synopsis

void _mm_storeu_si64 (void* mem_addr, __m128i a)
#include "immintrin.h"
Instruction: movq m64, xmm
CPUID Flags: SSE

Description

Store 64-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+63:mem_addr] := a[63:0]
movq
void _mm_storeu_si64 (void* mem_addr, __m128i a)

Synopsis

void _mm_storeu_si64 (void* mem_addr, __m128i a)
#include "immintrin.h"
Instruction: movq

Description

Store 64-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.

Operation

MEM[mem_addr+63:mem_addr] := a[63:0]
...
void _mm256_storeu2_m128 (float* hiaddr, float* loaddr, __m256 a)

Synopsis

void _mm256_storeu2_m128 (float* hiaddr, float* loaddr, __m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit) floating-point elements) from a into memory two different 128-bit locations. hiaddr and loaddr do not need to be aligned on any particular boundary.

Operation

MEM[loaddr+127:loaddr] := a[127:0] MEM[hiaddr+127:hiaddr] := a[255:128]
...
void _mm256_storeu2_m128d (double* hiaddr, double* loaddr, __m256d a)

Synopsis

void _mm256_storeu2_m128d (double* hiaddr, double* loaddr, __m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit) floating-point elements) from a into memory two different 128-bit locations. hiaddr and loaddr do not need to be aligned on any particular boundary.

Operation

MEM[loaddr+127:loaddr] := a[127:0] MEM[hiaddr+127:hiaddr] := a[255:128]
...
void _mm256_storeu2_m128i (__m128i* hiaddr, __m128i* loaddr, __m256i a)

Synopsis

void _mm256_storeu2_m128i (__m128i* hiaddr, __m128i* loaddr, __m256i a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Store the high and low 128-bit halves (each composed of integer data) from a into memory two different 128-bit locations. hiaddr and loaddr do not need to be aligned on any particular boundary.

Operation

MEM[loaddr+127:loaddr] := a[127:0] MEM[hiaddr+127:hiaddr] := a[255:128]
movntdqa
__m128i _mm_stream_load_si128 (__m128i* mem_addr)

Synopsis

__m128i _mm_stream_load_si128 (__m128i* mem_addr)
#include "smmintrin.h"
Instruction: movntdqa xmm, m128
CPUID Flags: SSE4.1

Description

Load 128-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

dst[127:0] := MEM[mem_addr+127:mem_addr]
vmovntdqa
__m256i _mm256_stream_load_si256 (__m256i const* mem_addr)

Synopsis

__m256i _mm256_stream_load_si256 (__m256i const* mem_addr)
#include "immintrin.h"
Instruction: vmovntdqa ymm, m256
CPUID Flags: AVX2

Description

Load 256-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0
vmovntdqa
__m512i _mm512_stream_load_si512 (void const* mem_addr)

Synopsis

__m512i _mm512_stream_load_si512 (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovntdqa zmm, m512
CPUID Flags: AVX512F

Description

Load 512-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

dst[511:0] := MEM[mem_addr+511:mem_addr] dst[MAX:512] := 0
movntpd
void _mm_stream_pd (double* mem_addr, __m128d a)

Synopsis

void _mm_stream_pd (double* mem_addr, __m128d a)
#include "emmintrin.h"
Instruction: movntpd m128, xmm
CPUID Flags: SSE2

Description

Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+127:mem_addr] := a[127:0]
vmovntpd
void _mm256_stream_pd (double * mem_addr, __m256d a)

Synopsis

void _mm256_stream_pd (double * mem_addr, __m256d a)
#include "immintrin.h"
Instruction: vmovntpd m256, ymm
CPUID Flags: AVX

Description

Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+255:mem_addr] := a[255:0]
vmovntpd
void _mm512_stream_pd (void* mem_addr, __m512d a)

Synopsis

void _mm512_stream_pd (void* mem_addr, __m512d a)
#include "immintrin.h"
Instruction: vmovntpd m512, zmm
CPUID Flags: AVX512F

Description

Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+511:mem_addr] := a[511:0]
movntq
void _mm_stream_pi (__m64* mem_addr, __m64 a)

Synopsis

void _mm_stream_pi (__m64* mem_addr, __m64 a)
#include "xmmintrin.h"
Instruction: movntq m64, mm
CPUID Flags: SSE

Description

Store 64-bits of integer data from a into memory using a non-temporal memory hint.

Operation

MEM[mem_addr+63:mem_addr] := a[63:0]
movntps
void _mm_stream_ps (float* mem_addr, __m128 a)

Synopsis

void _mm_stream_ps (float* mem_addr, __m128 a)
#include "xmmintrin.h"
Instruction: movntps m128, xmm
CPUID Flags: SSE

Description

Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+127:mem_addr] := a[127:0]
vmovntps
void _mm256_stream_ps (float * mem_addr, __m256 a)

Synopsis

void _mm256_stream_ps (float * mem_addr, __m256 a)
#include "immintrin.h"
Instruction: vmovntps m256, ymm
CPUID Flags: AVX

Description

Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+255:mem_addr] := a[255:0]
vmovntps
void _mm512_stream_ps (void* mem_addr, __m512 a)

Synopsis

void _mm512_stream_ps (void* mem_addr, __m512 a)
#include "immintrin.h"
Instruction: vmovntps m512, zmm
CPUID Flags: AVX512F

Description

Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+511:mem_addr] := a[511:0]
movntdq
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)

Synopsis

void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
#include "emmintrin.h"
Instruction: movntdq m128, xmm
CPUID Flags: SSE2

Description

Store 128-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+127:mem_addr] := a[127:0]
vmovntdq
void _mm256_stream_si256 (__m256i * mem_addr, __m256i a)

Synopsis

void _mm256_stream_si256 (__m256i * mem_addr, __m256i a)
#include "immintrin.h"
Instruction: vmovntdq m256, ymm
CPUID Flags: AVX

Description

Store 256-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+255:mem_addr] := a[255:0]
movnti
void _mm_stream_si32 (int* mem_addr, int a)

Synopsis

void _mm_stream_si32 (int* mem_addr, int a)
#include "emmintrin.h"
Instruction: movnti m32, r32
CPUID Flags: SSE2

Description

Store 32-bit integer a into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address mem_addr is already in the cache, the cache will be updated.

Operation

MEM[mem_addr+31:mem_addr] := a[31:0]
vmovntdq
void _mm512_stream_si512 (void* mem_addr, __m512i a)

Synopsis

void _mm512_stream_si512 (void* mem_addr, __m512i a)
#include "immintrin.h"
Instruction: vmovntdq m512, zmm
CPUID Flags: AVX512F

Description

Store 512-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.

Operation

MEM[mem_addr+511:mem_addr] := a[511:0]
movnti
void _mm_stream_si64 (__int64* mem_addr, __int64 a)

Synopsis

void _mm_stream_si64 (__int64* mem_addr, __int64 a)
#include "emmintrin.h"
Instruction: movnti m64, r64
CPUID Flags: SSE2

Description

Store 64-bit integer a into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address mem_addr is already in the cache, the cache will be updated.

Operation

MEM[mem_addr+63:mem_addr] := a[63:0]
vpsubw
__m128i _mm_mask_sub_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_sub_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubw
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpsubw
__m128i _mm_maskz_sub_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_sub_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubw
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
psubw
__m128i _mm_sub_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_sub_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubw xmm, xmm
CPUID Flags: SSE2

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := a[i+15:i] - b[i+15:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell1-
Ivy Bridge1-
Sandy Bridge1-
Westmere1-
Nehalem1-
vpsubw
__m256i _mm256_mask_sub_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_sub_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubw
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpsubw
__m256i _mm256_maskz_sub_epi16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_sub_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubw
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsubw
__m256i _mm256_sub_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_sub_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := a[i+15:i] - b[i+15:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsubw
__m512i _mm512_mask_sub_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_sub_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubw
CPUID Flags: AVX512BW

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpsubw
__m512i _mm512_maskz_sub_epi16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_sub_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubw
CPUID Flags: AVX512BW

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsubw
__m512i _mm512_sub_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_sub_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubw
CPUID Flags: AVX512BW

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 dst[i+15:i] := a[i+15:i] - b[i+15:i] ENDFOR dst[MAX:512] := 0
vpsubd
__m128i _mm_mask_sub_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_sub_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubd
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpsubd
__m128i _mm_maskz_sub_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_sub_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubd
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
psubd
__m128i _mm_sub_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_sub_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubd xmm, xmm
CPUID Flags: SSE2

Description

Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := a[i+31:i] - b[i+31:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpsubd
__m256i _mm256_mask_sub_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_sub_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubd
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpsubd
__m256i _mm256_maskz_sub_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_sub_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubd
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsubd
__m256i _mm256_sub_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_sub_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := a[i+31:i] - b[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsubd
__m512i _mm512_mask_sub_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_sub_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpsubd
__m512i _mm512_maskz_sub_epi32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_sub_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsubd
__m512i _mm512_sub_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_sub_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i] - b[i+31:i] ENDFOR dst[MAX:512] := 0
vpsubq
__m128i _mm_mask_sub_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_sub_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubq
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpsubq
__m128i _mm_maskz_sub_epi64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_sub_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubq
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
psubq
__m128i _mm_sub_epi64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_sub_epi64 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubq xmm, xmm
CPUID Flags: SSE2

Description

Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := a[i+63:i] - b[i+63:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere11
Nehalem11
vpsubq
__m256i _mm256_mask_sub_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_sub_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubq
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpsubq
__m256i _mm256_maskz_sub_epi64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_sub_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubq
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsubq
__m256i _mm256_sub_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_sub_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[i+63:i] - b[i+63:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsubq
__m512i _mm512_mask_sub_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_sub_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpsubq
__m512i _mm512_maskz_sub_epi64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_sub_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsubq
__m512i _mm512_sub_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_sub_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+63:i] - b[i+63:i] ENDFOR dst[MAX:512] := 0
vpsubb
__m128i _mm_mask_sub_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_sub_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubb
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] - b[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpsubb
__m128i _mm_maskz_sub_epi8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_sub_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubb
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] - b[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
psubb
__m128i _mm_sub_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_sub_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubb xmm, xmm
CPUID Flags: SSE2

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 dst[i+7:i] := a[i+7:i] - b[i+7:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpsubb
__m256i _mm256_mask_sub_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_sub_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubb
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] - b[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpsubb
__m256i _mm256_maskz_sub_epi8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_sub_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubb
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] - b[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsubb
__m256i _mm256_sub_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_sub_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := a[i+7:i] - b[i+7:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsubb
__m512i _mm512_mask_sub_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_sub_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubb
CPUID Flags: AVX512BW

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] - b[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpsubb
__m512i _mm512_maskz_sub_epi8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_sub_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubb
CPUID Flags: AVX512BW

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] - b[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsubb
__m512i _mm512_sub_epi8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_sub_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubb
CPUID Flags: AVX512BW

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst.

Operation

FOR j := 0 to 63 i := j*8 dst[i+7:i] := a[i+7:i] - b[i+7:i] ENDFOR dst[MAX:512] := 0
vsubpd
__m128d _mm_mask_sub_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_sub_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vsubpd
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vsubpd
__m128d _mm_maskz_sub_pd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_sub_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vsubpd
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
subpd
__m128d _mm_sub_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_sub_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: subpd xmm, xmm
CPUID Flags: SSE2

Description

Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := a[i+63:i] - b[i+63:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vsubpd
__m256d _mm256_mask_sub_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_mask_sub_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vsubpd
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vsubpd
__m256d _mm256_maskz_sub_pd (__mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_maskz_sub_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vsubpd
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vsubpd
__m256d _mm256_sub_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_sub_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vsubpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[i+63:i] - b[i+63:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
vsubpd
__m512d _mm512_mask_sub_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_sub_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vsubpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vsubpd
__m512d _mm512_maskz_sub_pd (__mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_maskz_sub_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vsubpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vsubpd
__m512d _mm512_sub_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_sub_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vsubpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+63:i] - b[i+63:i] ENDFOR dst[MAX:512] := 0
vsubps
__m128 _mm_mask_sub_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_sub_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vsubps
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vsubps
__m128 _mm_maskz_sub_ps (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_sub_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vsubps
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
subps
__m128 _mm_sub_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_sub_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: subps xmm, xmm
CPUID Flags: SSE

Description

Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := a[i+31:i] - b[i+31:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
Westmere31
Nehalem31
vsubps
__m256 _mm256_mask_sub_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_mask_sub_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vsubps
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vsubps
__m256 _mm256_maskz_sub_ps (__mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_maskz_sub_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vsubps
CPUID Flags: AVX512VL + AVX512F

Description

Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vsubps
__m256 _mm256_sub_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_sub_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vsubps ymm, ymm, ymm
CPUID Flags: AVX

Description

Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := a[i+31:i] - b[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge31
Sandy Bridge31
vsubps
__m512 _mm512_mask_sub_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_sub_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vsubps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vsubps
__m512 _mm512_maskz_sub_ps (__mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_maskz_sub_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vsubps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vsubps
__m512 _mm512_sub_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_sub_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vsubps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i] - b[i+31:i] ENDFOR dst[MAX:512] := 0
vsubpd
__m512d _mm512_mask_sub_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_mask_sub_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vsubpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vsubpd
__m512d _mm512_maskz_sub_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_maskz_sub_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vsubpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vsubpd
__m512d _mm512_sub_round_pd (__m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_sub_round_pd (__m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vsubpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+63:i] - b[i+63:i] ENDFOR dst[MAX:512] := 0
vsubps
__m512 _mm512_mask_sub_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_mask_sub_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vsubps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vsubps
__m512 _mm512_maskz_sub_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_maskz_sub_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vsubps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vsubps
__m512 _mm512_sub_round_ps (__m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_sub_round_ps (__m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vsubps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i] - b[i+31:i] ENDFOR dst[MAX:512] := 0
vsubsd
__m128d _mm_mask_sub_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_mask_sub_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vsubsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := a[63:0] - b[63:0] ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vsubsd
__m128d _mm_maskz_sub_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_maskz_sub_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vsubsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[63:0] := a[63:0] - b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vsubsd
__m128d _mm_sub_round_sd (__m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_sub_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vsubsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := a[63:0] - b[63:0] dst[127:64] := a[127:64] dst[MAX:128] := 0
vsubss
__m128 _mm_mask_sub_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_mask_sub_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vsubss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := a[31:0] - b[31:0] ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vsubss
__m128 _mm_maskz_sub_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_maskz_sub_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vsubss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0] dst[31:0] := a[31:0] - b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vsubss
__m128 _mm_sub_round_ss (__m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_sub_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vsubss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := a[31:0] - b[31:0] dst[127:32] := a[127:32] dst[MAX:128] := 0
vsubsd
__m128d _mm_mask_sub_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_sub_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vsubsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := a[63:0] - b[63:0] ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
vsubsd
__m128d _mm_maskz_sub_sd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_sub_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vsubsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0] dst[63:0] := a[63:0] - b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
subsd
__m128d _mm_sub_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_sub_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: subsd xmm, xmm
CPUID Flags: SSE2

Description

Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := a[63:0] - b[63:0] dst[127:64] := a[127:64]

Performance

Architecture | Latency | Throughput
Haswell | 3 | 1
Ivy Bridge | 3 | 1
Sandy Bridge | 3 | 1
Westmere | 3 | 1
Nehalem | 3 | 1
psubq
__m64 _mm_sub_si64 (__m64 a, __m64 b)

Synopsis

__m64 _mm_sub_si64 (__m64 a, __m64 b)
#include "emmintrin.h"
Instruction: psubq mm, mm
CPUID Flags: SSE2

Description

Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.

Operation

dst[63:0] := a[63:0] - b[63:0]

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vsubss
__m128 _mm_mask_sub_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_sub_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vsubss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := a[31:0] - b[31:0] ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
vsubss
__m128 _mm_maskz_sub_ss (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_sub_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vsubss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0] dst[31:0] := a[31:0] - b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
subss
__m128 _mm_sub_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_sub_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: subss xmm, xmm
CPUID Flags: SSE

Description

Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := a[31:0] - b[31:0] dst[127:32] := a[127:32]

Performance

Architecture | Latency | Throughput
Haswell | 3 | 1
Ivy Bridge | 3 | 1
Sandy Bridge | 3 | 1
Westmere | 3 | 1
Nehalem | 3 | 1
sbb
unsigned char _subborrow_u32 (unsigned char b_in, unsigned int a, unsigned int b, unsigned int * out)

Synopsis

unsigned char _subborrow_u32 (unsigned char b_in, unsigned int a, unsigned int b, unsigned int * out)
#include "immintrin.h"
Instruction: sbb r32, r32

Description

Add unsigned 8-bit borrow b_in (carry flag) to unsigned 32-bit integer a, and subtract the result from unsigned 32-bit integer b. Store the unsigned 32-bit result in out, and the carry-out in dst (carry or overflow flag).

Operation

dst:out[31:0] := (b[31:0] - (a[31:0] + b_in));
sbb
unsigned char _subborrow_u64 (unsigned char b_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)

Synopsis

unsigned char _subborrow_u64 (unsigned char b_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)
#include "immintrin.h"
Instruction: sbb r64, r64

Description

Add unsigned 8-bit borrow b_in (carry flag) to unsigned 64-bit integer a, and subtract the result from unsigned 64-bit integer b. Store the unsigned 64-bit result in out, and the carry-out in dst (carry or overflow flag).

Operation

dst:out[63:0] := (b[63:0] - (a[63:0] + b_in));
vpsubrd
__m512i _mm512_mask_subr_epi32 (__m512i src, __mmask16 k, __m512i v2, __m512i v3)

Synopsis

__m512i _mm512_mask_subr_epi32 (__m512i src, __mmask16 k, __m512i v2, __m512i v3)
#include "immintrin.h"
Instruction: vpsubrd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element subtraction of packed 32-bit integer elements in v2 from v3 storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := v3[i+31:i] - v2[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpsubrd
__m512i _mm512_subr_epi32 (__m512i v2, __m512i v3)

Synopsis

__m512i _mm512_subr_epi32 (__m512i v2, __m512i v3)
#include "immintrin.h"
Instruction: vpsubrd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element subtraction of packed 32-bit integer elements in v2 from v3 storing the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := v3[i+31:i] - v2[i+31:i] ENDFOR dst[MAX:512] := 0
vsubrpd
__m512d _mm512_mask_subr_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3)

Synopsis

__m512d _mm512_mask_subr_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3)
#include "immintrin.h"
Instruction: vsubrpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in v2 from v3 storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := v3[i+63:i] - v2[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vsubrpd
__m512d _mm512_subr_pd (__m512d v2, __m512d v3)

Synopsis

__m512d _mm512_subr_pd (__m512d v2, __m512d v3)
#include "immintrin.h"
Instruction: vsubrpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in v2 from v3 storing the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := v3[i+63:i] - v2[i+63:i] ENDFOR dst[MAX:512] := 0
vsubrps
__m512 _mm512_mask_subr_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3)

Synopsis

__m512 _mm512_mask_subr_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3)
#include "immintrin.h"
Instruction: vsubrps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in v2 from v3 storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := v3[i+31:i] - v2[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vsubrps
__m512 _mm512_subr_ps (__m512 v2, __m512 v3)

Synopsis

__m512 _mm512_subr_ps (__m512 v2, __m512 v3)
#include "immintrin.h"
Instruction: vsubrps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in v2 from v3 storing the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := v3[i+31:i] - v2[i+31:i] ENDFOR dst[MAX:512] := 0
vsubrpd
__m512d _mm512_mask_subr_round_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3, int rounding)

Synopsis

__m512d _mm512_mask_subr_round_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3, int rounding)
#include "immintrin.h"
Instruction: vsubrpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in v2 from v3 storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := v3[i+63:i] - v2[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vsubrpd
__m512d _mm512_subr_round_pd (__m512d v2, __m512d v3, int rounding)

Synopsis

__m512d _mm512_subr_round_pd (__m512d v2, __m512d v3, int rounding)
#include "immintrin.h"
Instruction: vsubrpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in v2 from v3 storing the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := v3[i+63:i] - v2[i+63:i] ENDFOR dst[MAX:512] := 0
vsubrps
__m512 _mm512_mask_subr_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, int rounding)

Synopsis

__m512 _mm512_mask_subr_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, int rounding)
#include "immintrin.h"
Instruction: vsubrps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in v2 from v3 storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := v3[i+31:i] - v2[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vsubrps
__m512 _mm512_subr_round_ps (__m512 v2, __m512 v3, int rounding)

Synopsis

__m512 _mm512_subr_round_ps (__m512 v2, __m512 v3, int rounding)
#include "immintrin.h"
Instruction: vsubrps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in v2 from v3 storing the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := v3[i+31:i] - v2[i+31:i] ENDFOR dst[MAX:512] := 0
vpsubrsetbd
__m512i _mm512_mask_subrsetb_epi32 (__m512i v2, __mmask16 k, __mmask16 k_old, __m512i v3, __mmask16 * borrow)

Synopsis

__m512i _mm512_mask_subrsetb_epi32 (__m512i v2, __mmask16 k, __mmask16 k_old, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsubrsetbd zmm {k}, k, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element subtraction of packed 32-bit integer elements in v2 from v3, storing the results in dst and v2. The borrowed value from the subtraction difference for the nth element is written to the nth bit of borrow (borrow flag). Results are written using writemask k (elements are copied from k_old when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] diff := v3[i+31:i] - v2[i+31:i] borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i]) dst[i+31:i] := diff v2[i+31:i] := diff ELSE borrow[j] := k_old[j] FI ENDFOR dst[MAX:512] := 0
vpsubrsetbd
__m512i _mm512_subrsetb_epi32 (__m512i v2, __m512i v3, __mmask16 * borrow)

Synopsis

__m512i _mm512_subrsetb_epi32 (__m512i v2, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsubrsetbd zmm {k}, k, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element subtraction of packed 32-bit integer elements in v2 from v3, storing the results in dst and v2. The borrowed value from the subtraction difference for the nth element is written to the nth bit of borrow (borrow flag).

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := v3[i+31:i] - v2[i+31:i] borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i]) ENDFOR dst[MAX:512] := 0
vpsubsw
__m128i _mm_mask_subs_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_subs_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubsw
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpsubsw
__m128i _mm_maskz_subs_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_subs_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubsw
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
psubsw
__m128i _mm_subs_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_subs_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubsw xmm, xmm
CPUID Flags: SSE2

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
Ivy Bridge | 1 | 0.5
Sandy Bridge | 1 | 0.5
Westmere | 1 | 0.5
Nehalem | 1 | 0.5
vpsubsw
__m256i _mm256_mask_subs_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_subs_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubsw
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpsubsw
__m256i _mm256_maskz_subs_epi16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_subs_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubsw
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsubsw
__m256i _mm256_subs_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_subs_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubsw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vpsubsw
__m512i _mm512_mask_subs_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_subs_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubsw
CPUID Flags: AVX512BW

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpsubsw
__m512i _mm512_maskz_subs_epi16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_subs_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubsw
CPUID Flags: AVX512BW

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsubsw
__m512i _mm512_subs_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_subs_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubsw
CPUID Flags: AVX512BW

Description

Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ENDFOR dst[MAX:512] := 0
vpsubsb
__m128i _mm_mask_subs_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_subs_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubsb
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpsubsb
__m128i _mm_maskz_subs_epi8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_subs_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubsb
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
psubsb
__m128i _mm_subs_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_subs_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubsb xmm, xmm
CPUID Flags: SSE2

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ENDFOR

Performance

Architecture | Latency | Throughput
Haswell | 1 | 0.5
Ivy Bridge | 1 | 0.5
Sandy Bridge | 1 | 0.5
Westmere | 1 | 0.5
Nehalem | 1 | 0.5
vpsubsb
__m256i _mm256_mask_subs_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_subs_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubsb
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpsubsb
__m256i _mm256_maskz_subs_epi8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_subs_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubsb
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsubsb
__m256i _mm256_subs_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_subs_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubsb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ENDFOR dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell | 1 | -
vpsubsb
__m512i _mm512_mask_subs_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_subs_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubsb
CPUID Flags: AVX512BW

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpsubsb
__m512i _mm512_maskz_subs_epi8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_subs_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubsb
CPUID Flags: AVX512BW

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsubsb
__m512i _mm512_subs_epi8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_subs_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubsb
CPUID Flags: AVX512BW

Description

Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst.

Operation

FOR j := 0 to 63 i := j*8 dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ENDFOR dst[MAX:512] := 0
vpsubusw
__m128i _mm_mask_subs_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_subs_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubusw
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpsubusw
__m128i _mm_maskz_subs_epu16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_subs_epu16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubusw
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
psubusw
__m128i _mm_subs_epu16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_subs_epu16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubusw xmm, xmm
CPUID Flags: SSE2

Description

Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*16 dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpsubusw
__m256i _mm256_mask_subs_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_subs_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubusw
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpsubusw
__m256i _mm256_maskz_subs_epu16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_subs_epu16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubusw
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsubusw
__m256i _mm256_subs_epu16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_subs_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubusw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*16 dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsubusw
__m512i _mm512_mask_subs_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_subs_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubusw
CPUID Flags: AVX512BW

Description

Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpsubusw
__m512i _mm512_maskz_subs_epu16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_subs_epu16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubusw
CPUID Flags: AVX512BW

Description

Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsubusw
__m512i _mm512_subs_epu16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_subs_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubusw
CPUID Flags: AVX512BW

Description

Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*16 dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ENDFOR dst[MAX:512] := 0
vpsubusb
__m128i _mm_mask_subs_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_subs_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubusb
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpsubusb
__m128i _mm_maskz_subs_epu8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_subs_epu8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubusb
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
psubusb
__m128i _mm_subs_epu8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_subs_epu8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubusb xmm, xmm
CPUID Flags: SSE2

Description

Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*8 dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.5
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpsubusb
__m256i _mm256_mask_subs_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_subs_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubusb
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpsubusb
__m256i _mm256_maskz_subs_epu8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_subs_epu8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubusb
CPUID Flags: AVX512VL + AVX512BW

Description

Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpsubusb
__m256i _mm256_subs_epu8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_subs_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubusb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst.

Operation

FOR j := 0 to 31 i := j*8 dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpsubusb
__m512i _mm512_mask_subs_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_subs_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubusb
CPUID Flags: AVX512BW

Description

Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpsubusb
__m512i _mm512_maskz_subs_epu8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_subs_epu8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubusb
CPUID Flags: AVX512BW

Description

Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpsubusb
__m512i _mm512_subs_epu8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_subs_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubusb
CPUID Flags: AVX512BW

Description

Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst.

Operation

FOR j := 0 to 63 i := j*8 dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ENDFOR dst[MAX:512] := 0
vpsubsetbd
__m512i _mm512_mask_subsetb_epi32 (__m512i v2, __mmask16 k, __mmask16 k_old, __m512i v3, __mmask16 * borrow)

Synopsis

__m512i _mm512_mask_subsetb_epi32 (__m512i v2, __mmask16 k, __mmask16 k_old, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsubsetbd zmm {k}, k, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element subtraction of packed 32-bit integer elements in v3 from v2, storing the results in dst and the nth borrow bit in the nth position of borrow (borrow flag). Results are stored using writemask k (elements are copied from v2 and k_old when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := v2[i+31:i] - v3[i+31:i] borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i]) ELSE dst[i+31:i] := v2[i+31:i] borrow[j] := k_old[j] FI ENDFOR dst[MAX:512] := 0
vpsubsetbd
__m512i _mm512_subsetb_epi32 (__m512i v2, __m512i v3, __mmask16 * borrow)

Synopsis

__m512i _mm512_subsetb_epi32 (__m512i v2, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsubsetbd zmm {k}, k, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element subtraction of packed 32-bit integer elements in v3 from v2, storing the results in dst and the nth borrow bit in the nth position of borrow (borrow flag).

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := v2[i+31:i] - v3[i+31:i] borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m128d _mm_svml_ceil_pd (__m128d a)

Synopsis

__m128d _mm_svml_ceil_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := CEIL(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_svml_ceil_pd (__m256d a)

Synopsis

__m256d _mm256_svml_ceil_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := CEIL(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m128 _mm_svml_ceil_ps (__m128 a)

Synopsis

__m128 _mm_svml_ceil_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := CEIL(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_svml_ceil_ps (__m256 a)

Synopsis

__m256 _mm256_svml_ceil_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := CEIL(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m128d _mm_svml_floor_pd (__m128d a)

Synopsis

__m128d _mm_svml_floor_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := FLOOR(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_svml_floor_pd (__m256d a)

Synopsis

__m256d _mm256_svml_floor_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := FLOOR(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m128 _mm_svml_floor_ps (__m128 a)

Synopsis

__m128 _mm_svml_floor_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := FLOOR(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_svml_floor_ps (__m256 a)

Synopsis

__m256 _mm256_svml_floor_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := FLOOR(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m128d _mm_svml_round_pd (__m128d a)

Synopsis

__m128d _mm_svml_round_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Round the packed double-precision (64-bit) floating-point elements in a to the nearest integer value, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := ROUND(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_svml_round_pd (__m256d a)

Synopsis

__m256d _mm256_svml_round_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Round the packed double-precision (64-bit) floating-point elements in a to the nearest integer value, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := ROUND(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_mask_svml_round_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_svml_round_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Round the packed double-precision (64-bit) floating-point elements in a to the nearest integer value, and store the results as packed double-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ROUND(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_svml_round_pd (__m512d a)

Synopsis

__m512d _mm512_svml_round_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Round the packed double-precision (64-bit) floating-point elements in a to the nearest integer value, and store the results as packed double-precision floating-point elements in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := ROUND(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m128 _mm_svml_round_ps (__m128 a)

Synopsis

__m128 _mm_svml_round_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Round the packed single-precision (32-bit) floating-point elements in a to the nearest integer value, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := ROUND(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_svml_round_ps (__m256 a)

Synopsis

__m256 _mm256_svml_round_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Round the packed single-precision (32-bit) floating-point elements in a to the nearest integer value, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := ROUND(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m128d _mm_svml_sqrt_pd (__m128d a)

Synopsis

__m128d _mm_svml_sqrt_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. Note that this intrinsic is less efficient than _mm_sqrt_pd.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := SQRT(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_svml_sqrt_pd (__m256d a)

Synopsis

__m256d _mm256_svml_sqrt_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. Note that this intrinsic is less efficient than _mm_sqrt_pd.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := SQRT(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m128 _mm_svml_sqrt_ps (__m128 a)

Synopsis

__m128 _mm_svml_sqrt_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. Note that this intrinsic is less efficient than _mm_sqrt_ps.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := SQRT(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_svml_sqrt_ps (__m256 a)

Synopsis

__m256 _mm256_svml_sqrt_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. Note that this intrinsic is less efficient than _mm_sqrt_ps.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := SQRT(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512i _mm512_mask_swizzle_epi32 (__m512i src, __mmask16 k, __m512i v, _MM_SWIZZLE_ENUM s)

Synopsis

__m512i _mm512_mask_swizzle_epi32 (__m512i src, __mmask16 k, __m512i v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Performs a swizzle transformation of each of the four groups of packed 4x32-bit integer elements in v using swizzle parameter s, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE s OF _MM_SWIZ_REG_NONE: dst[511:0] := v[511:0] _MM_SWIZ_REG_DCBA: dst[511:0] := v[511:0] _MM_SWIZ_REG_CDAB: FOR j := 0 to 7 i := j*64 IF k[j*2] dst[i+31:i] := v[i+63:i+32] ELSE dst[i+31:i] := src[i+31:i] FI IF k[j*2+1] dst[i+63:i+32] := v[i+31:i] ELSE dst[i+63:i+32] := src[i+63:i+32] FI ENDFOR _MM_SWIZ_REG_BADC: FOR j := 0 to 3 i := j*128 IF k[j*4] dst[i+31:i] := v[i+95:i+64] ELSE dst[i+31:i] := src[i+31:i] FI IF k[j*4+1] dst[i+63:i+32] := v[i+127:i+96] ELSE dst[i+63:i+32] := src[i+63:i+32] FI IF k[j*4+2] dst[i+95:i+64] := v[i+31:i] ELSE dst[i+95:i+64] := src[i+95:i+64] FI IF k[j*4+3] dst[i+127:i+96] := v[i+63:i+32] ELSE dst[i+127:i+96] := src[i+127:i+96] FI ENDFOR _MM_SWIZ_REG_AAAA: FOR j := 0 to 3 i := j*128 IF k[j*4] dst[i+31:i] := v[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI IF k[j*4+1] dst[i+63:i+32] := v[i+31:i] ELSE dst[i+63:i+32] := src[i+63:i+32] FI IF k[j*4+2] dst[i+95:i+64] := v[i+31:i] ELSE dst[i+95:i+64] := src[i+95:i+64] FI IF k[j*4+3] dst[i+127:i+96] := v[i+31:i] ELSE dst[i+127:i+96] := src[i+127:i+96] FI ENDFOR _MM_SWIZ_REG_BBBB: FOR j := 0 to 3 i := j*128 IF k[j*4] dst[i+31:i] := v[i+63:i+32] ELSE dst[i+31:i] := src[i+31:i] FI IF k[j*4+1] dst[i+63:i+32] := v[i+63:i+32] ELSE dst[i+63:i+32] := src[i+63:i+32] FI IF k[j*4+2] dst[i+95:i+64] := v[i+63:i+32] ELSE dst[i+95:i+64] := src[i+95:i+64] FI IF k[j*4+3] dst[i+127:i+96] := v[i+63:i+32] ELSE dst[i+127:i+96] := src[i+127:i+96] FI ENDFOR _MM_SWIZ_REG_CCCC: FOR j := 0 to 3 i := j*128 IF k[j*4] dst[i+31:i] := v[i+95:i+64] ELSE dst[i+31:i] := src[i+31:i] FI IF k[j*4+1] dst[i+63:i+32] := v[i+95:i+64] ELSE dst[i+63:i+32] := src[i+63:i+32] FI IF k[j*4+2] dst[i+95:i+64] := v[i+95:i+64] ELSE dst[i+95:i+64] := src[i+95:i+64] FI IF k[j*4+3] dst[i+127:i+96] := v[i+95:i+64] ELSE dst[i+127:i+96] := src[i+127:i+96] FI ENDFOR _MM_SWIZ_REG_DDDD: FOR j := 0 to 3 i := j*128 IF k[j*4] dst[i+31:i] := v[i+127:i+96] ELSE dst[i+31:i] := src[i+31:i] FI IF k[j*4+1] dst[i+63:i+32] := v[i+127:i+96] ELSE dst[i+63:i+32] := 
src[i+63:i+32] FI IF k[j*4+2] dst[i+95:i+64] := v[i+127:i+96] ELSE dst[i+95:i+64] := src[i+95:i+64] FI IF k[j*4+3] dst[i+127:i+96] := v[i+127:i+96] ELSE dst[i+127:i+96] := src[i+127:i+96] FI ENDFOR _MM_SWIZ_REG_DACB: FOR j := 0 to 3 i := j*128 IF k[j*4] dst[i+31:i] := v[i+63:i+32] ELSE dst[i+31:i] := src[i+31:i] FI IF k[j*4+1] dst[i+63:i+32] := v[i+95:i+64] ELSE dst[i+63:i+32] := src[i+63:i+32] FI IF k[j*4+2] dst[i+95:i+64] := v[i+31:i] ELSE dst[i+95:i+64] := src[i+95:i+64] FI IF k[j*4+3] dst[i+127:i+96] := v[i+127:i+96] ELSE dst[i+127:i+96] := src[i+127:i+96] FI ENDFOR ESAC dst[MAX:512] := 0
...
__m512i _mm512_swizzle_epi32 (__m512i v, _MM_SWIZZLE_ENUM s)

Synopsis

__m512i _mm512_swizzle_epi32 (__m512i v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Performs a swizzle transformation of each of the four groups of packed 4x 32-bit integer elements in v using swizzle parameter s, storing the results in dst.

Operation

CASE s OF _MM_SWIZ_REG_NONE: dst[511:0] := v[511:0] _MM_SWIZ_REG_DCBA: dst[511:0] := v[511:0] _MM_SWIZ_REG_CDAB: FOR j := 0 to 7 i := j*64 dst[i+31:i] := v[i+63:i+32] dst[i+63:i+32] := v[i+31:i] ENDFOR _MM_SWIZ_REG_BADC: FOR j := 0 to 3 i := j*128 dst[i+31:i] := v[i+95:i+64] dst[i+63:i+32] := v[i+127:i+96] dst[i+95:i+64] := v[i+31:i] dst[i+127:i+96] := v[i+63:i+32] ENDFOR _MM_SWIZ_REG_AAAA: FOR j := 0 to 3 i := j*128 dst[i+31:i] := v[i+31:i] dst[i+63:i+32] := v[i+31:i] dst[i+95:i+64] := v[i+31:i] dst[i+127:i+96] := v[i+31:i] ENDFOR _MM_SWIZ_REG_BBBB: FOR j := 0 to 3 i := j*128 dst[i+31:i] := v[i+63:i+32] dst[i+63:i+32] := v[i+63:i+32] dst[i+95:i+64] := v[i+63:i+32] dst[i+127:i+96] := v[i+63:i+32] ENDFOR _MM_SWIZ_REG_CCCC: FOR j := 0 to 3 i := j*128 dst[i+31:i] := v[i+95:i+64] dst[i+63:i+32] := v[i+95:i+64] dst[i+95:i+64] := v[i+95:i+64] dst[i+127:i+96] := v[i+95:i+64] ENDFOR _MM_SWIZ_REG_DDDD: FOR j := 0 to 3 i := j*128 dst[i+31:i] := v[i+127:i+96] dst[i+63:i+32] := v[i+127:i+96] dst[i+95:i+64] := v[i+127:i+96] dst[i+127:i+96] := v[i+127:i+96] ENDFOR _MM_SWIZ_REG_DACB: FOR j := 0 to 3 i := j*128 dst[i+31:i] := v[i+63:i+32] dst[i+63:i+32] := v[i+95:i+64] dst[i+95:i+64] := v[i+31:i] dst[i+127:i+96] := v[i+127:i+96] ENDFOR ESAC dst[MAX:512] := 0
...
__m512i _mm512_mask_swizzle_epi64 (__m512i src, __mmask8 k, __m512i v, _MM_SWIZZLE_ENUM s)

Synopsis

__m512i _mm512_mask_swizzle_epi64 (__m512i src, __mmask8 k, __m512i v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Performs a swizzle transformation of each of the two groups of packed 4x64-bit integer elements in v using swizzle parameter s, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE s OF _MM_SWIZ_REG_NONE: dst[511:0] := v[511:0] _MM_SWIZ_REG_DCBA: dst[511:0] := v[511:0] _MM_SWIZ_REG_CDAB: FOR j := 0 to 3 i := j*128 IF k[j*2] dst[i+63:i] := v[i+127:i+64] ELSE dst[i+63:i] := src[i+63:i] FI IF k[j*2+1] dst[i+127:i+64] := v[i+63:i] ELSE dst[i+127:i+64] := src[i+127:i+64] FI ENDFOR _MM_SWIZ_REG_BADC: FOR j := 0 to 1 i := j*256 IF k[j*4] dst[i+63:i] := v[i+191:i+128] ELSE dst[i+63:i] := src[i+63:i] FI IF k[j*4+1] dst[i+127:i+64] := v[i+255:i+192] ELSE dst[i+127:i+64] := src[i+127:i+64] FI IF k[j*4+2] dst[i+191:i+128] := v[i+63:i] ELSE dst[i+191:i+128] := src[i+191:i+128] FI IF k[j*4+3] dst[i+255:i+192] := v[i+127:i+64] ELSE dst[i+255:i+192] := src[i+255:i+192] FI ENDFOR _MM_SWIZ_REG_AAAA: FOR j := 0 to 1 i := j*256 IF k[j*4] dst[i+63:i] := v[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI IF k[j*4+1] dst[i+127:i+64] := v[i+63:i] ELSE dst[i+127:i+64] := src[i+127:i+64] FI IF k[j*4+2] dst[i+191:i+128] := v[i+63:i] ELSE dst[i+191:i+128] := src[i+191:i+128] FI IF k[j*4+3] dst[i+255:i+192] := v[i+63:i] ELSE dst[i+255:i+192] := src[i+255:i+192] FI ENDFOR _MM_SWIZ_REG_BBBB: FOR j := 0 to 1 i := j*256 IF k[j*4] dst[i+63:i] := v[i+127:i+64] ELSE dst[i+63:i] := src[i+63:i] FI IF k[j*4+1] dst[i+127:i+64] := v[i+127:i+64] ELSE dst[i+127:i+64] := src[i+127:i+64] FI IF k[j*4+2] dst[i+191:i+128] := v[i+127:i+64] ELSE dst[i+191:i+128] := src[i+191:i+128] FI IF k[j*4+3] dst[i+255:i+192] := v[i+127:i+64] ELSE dst[i+255:i+192] := src[i+255:i+192] FI ENDFOR _MM_SWIZ_REG_CCCC: FOR j := 0 to 1 i := j*256 IF k[j*4] dst[i+63:i] := v[i+191:i+128] ELSE dst[i+63:i] := src[i+63:i] FI IF k[j*4+1] dst[i+127:i+64] := v[i+191:i+128] ELSE dst[i+127:i+64] := src[i+127:i+64] FI IF k[j*4+2] dst[i+191:i+128] := v[i+191:i+128] ELSE dst[i+191:i+128] := src[i+191:i+128] FI IF k[j*4+3] dst[i+255:i+192] := v[i+191:i+128] ELSE dst[i+255:i+192] := src[i+255:i+192] FI ENDFOR _MM_SWIZ_REG_DDDD: FOR j := 0 to 1 i := j*256 IF k[j*4] dst[i+63:i] := v[i+255:i+192] ELSE dst[i+63:i] := src[i+63:i] FI 
IF k[j*4+1] dst[i+127:i+64] := v[i+255:i+192] ELSE dst[i+127:i+64] := src[i+127:i+64] FI IF k[j*4+2] dst[i+191:i+128] := v[i+255:i+192] ELSE dst[i+191:i+128] := src[i+191:i+128] FI IF k[j*4+3] dst[i+255:i+192] := v[i+255:i+192] ELSE dst[i+255:i+192] := src[i+255:i+192] FI ENDFOR _MM_SWIZ_REG_DACB: FOR j := 0 to 1 i := j*256 IF k[j*4] dst[i+63:i] := v[i+127:i+64] ELSE dst[i+63:i] := src[i+63:i] FI IF k[j*4+1] dst[i+127:i+64] := v[i+191:i+128] ELSE dst[i+127:i+64] := src[i+127:i+64] FI IF k[j*4+2] dst[i+191:i+128] := v[i+63:i] ELSE dst[i+191:i+128] := src[i+191:i+128] FI IF k[j*4+3] dst[i+255:i+192] := v[i+255:i+192] ELSE dst[i+255:i+192] := src[i+255:i+192] FI ENDFOR ESAC dst[MAX:512] := 0
...
__m512i _mm512_swizzle_epi64 (__m512i v, _MM_SWIZZLE_ENUM s)

Synopsis

__m512i _mm512_swizzle_epi64 (__m512i v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Performs a swizzle transformation of each of the two groups of packed 4x64-bit integer elements in v using swizzle parameter s, storing the results in dst.

Operation

CASE s OF _MM_SWIZ_REG_NONE: dst[511:0] := v[511:0] _MM_SWIZ_REG_DCBA: dst[511:0] := v[511:0] _MM_SWIZ_REG_CDAB: FOR j := 0 to 3 i := j*128 dst[i+63:i] := v[i+127:i+64] dst[i+127:i+64] := v[i+63:i] ENDFOR _MM_SWIZ_REG_BADC: FOR j := 0 to 1 i := j*256 dst[i+63:i] := v[i+191:i+128] dst[i+127:i+64] := v[i+255:i+192] dst[i+191:i+128] := v[i+63:i] dst[i+255:i+192] := v[i+127:i+64] ENDFOR _MM_SWIZ_REG_AAAA: FOR j := 0 to 1 i := j*256 dst[i+63:i] := v[i+63:i] dst[i+127:i+64] := v[i+63:i] dst[i+191:i+128] := v[i+63:i] dst[i+255:i+192] := v[i+63:i] ENDFOR _MM_SWIZ_REG_BBBB: FOR j := 0 to 1 i := j*256 dst[i+63:i] := v[i+127:i+64] dst[i+127:i+64] := v[i+127:i+64] dst[i+191:i+128] := v[i+127:i+64] dst[i+255:i+192] := v[i+127:i+64] ENDFOR _MM_SWIZ_REG_CCCC: FOR j := 0 to 1 i := j*256 dst[i+63:i] := v[i+191:i+128] dst[i+127:i+64] := v[i+191:i+128] dst[i+191:i+128] := v[i+191:i+128] dst[i+255:i+192] := v[i+191:i+128] ENDFOR _MM_SWIZ_REG_DDDD: FOR j := 0 to 1 i := j*256 dst[i+63:i] := v[i+255:i+192] dst[i+127:i+64] := v[i+255:i+192] dst[i+191:i+128] := v[i+255:i+192] dst[i+255:i+192] := v[i+255:i+192] ENDFOR _MM_SWIZ_REG_DACB: FOR j := 0 to 1 i := j*256 dst[i+63:i] := v[i+127:i+64] dst[i+127:i+64] := v[i+191:i+128] dst[i+191:i+128] := v[i+63:i] dst[i+255:i+192] := v[i+255:i+192] ENDFOR ESAC dst[MAX:512] := 0
...
__m512d _mm512_mask_swizzle_pd (__m512d src, __mmask8 k, __m512d v, _MM_SWIZZLE_ENUM s)

Synopsis

__m512d _mm512_mask_swizzle_pd (__m512d src, __mmask8 k, __m512d v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Performs a swizzle transformation of each of the two groups of packed 4x double-precision (64-bit) floating-point elements in v using swizzle parameter s, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE s OF _MM_SWIZ_REG_NONE: dst[511:0] := v[511:0] _MM_SWIZ_REG_DCBA: dst[511:0] := v[511:0] _MM_SWIZ_REG_CDAB: FOR j := 0 to 3 i := j*128 IF k[j*2] dst[i+63:i] := v[i+127:i+64] ELSE dst[i+63:i] := src[i+63:i] FI IF k[j*2+1] dst[i+127:i+64] := v[i+63:i] ELSE dst[i+127:i+64] := src[i+127:i+64] FI ENDFOR _MM_SWIZ_REG_BADC: FOR j := 0 to 1 i := j*256 IF k[j*4] dst[i+63:i] := v[i+191:i+128] ELSE dst[i+63:i] := src[i+63:i] FI IF k[j*4+1] dst[i+127:i+64] := v[i+255:i+192] ELSE dst[i+127:i+64] := src[i+127:i+64] FI IF k[j*4+2] dst[i+191:i+128] := v[i+63:i] ELSE dst[i+191:i+128] := src[i+191:i+128] FI IF k[j*4+3] dst[i+255:i+192] := v[i+127:i+64] ELSE dst[i+255:i+192] := src[i+255:i+192] FI ENDFOR _MM_SWIZ_REG_AAAA: FOR j := 0 to 1 i := j*256 IF k[j*4] dst[i+63:i] := v[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI IF k[j*4+1] dst[i+127:i+64] := v[i+63:i] ELSE dst[i+127:i+64] := src[i+127:i+64] FI IF k[j*4+2] dst[i+191:i+128] := v[i+63:i] ELSE dst[i+191:i+128] := src[i+191:i+128] FI IF k[j*4+3] dst[i+255:i+192] := v[i+63:i] ELSE dst[i+255:i+192] := src[i+255:i+192] FI ENDFOR _MM_SWIZ_REG_BBBB: FOR j := 0 to 1 i := j*256 IF k[j*4] dst[i+63:i] := v[i+127:i+64] ELSE dst[i+63:i] := src[i+63:i] FI IF k[j*4+1] dst[i+127:i+64] := v[i+127:i+64] ELSE dst[i+127:i+64] := src[i+127:i+64] FI IF k[j*4+2] dst[i+191:i+128] := v[i+127:i+64] ELSE dst[i+191:i+128] := src[i+191:i+128] FI IF k[j*4+3] dst[i+255:i+192] := v[i+127:i+64] ELSE dst[i+255:i+192] := src[i+255:i+192] FI ENDFOR _MM_SWIZ_REG_CCCC: FOR j := 0 to 1 i := j*256 IF k[j*4] dst[i+63:i] := v[i+191:i+128] ELSE dst[i+63:i] := src[i+63:i] FI IF k[j*4+1] dst[i+127:i+64] := v[i+191:i+128] ELSE dst[i+127:i+64] := src[i+127:i+64] FI IF k[j*4+2] dst[i+191:i+128] := v[i+191:i+128] ELSE dst[i+191:i+128] := src[i+191:i+128] FI IF k[j*4+3] dst[i+255:i+192] := v[i+191:i+128] ELSE dst[i+255:i+192] := src[i+255:i+192] FI ENDFOR _MM_SWIZ_REG_DDDD: FOR j := 0 to 1 i := j*256 IF k[j*4] dst[i+63:i] := v[i+255:i+192] ELSE dst[i+63:i] := src[i+63:i] FI 
IF k[j*4+1] dst[i+127:i+64] := v[i+255:i+192] ELSE dst[i+127:i+64] := src[i+127:i+64] FI IF k[j*4+2] dst[i+191:i+128] := v[i+255:i+192] ELSE dst[i+191:i+128] := src[i+191:i+128] FI IF k[j*4+3] dst[i+255:i+192] := v[i+255:i+192] ELSE dst[i+255:i+192] := src[i+255:i+192] FI ENDFOR _MM_SWIZ_REG_DACB: FOR j := 0 to 1 i := j*256 IF k[j*4] dst[i+63:i] := v[i+127:i+64] ELSE dst[i+63:i] := src[i+63:i] FI IF k[j*4+1] dst[i+127:i+64] := v[i+191:i+128] ELSE dst[i+127:i+64] := src[i+127:i+64] FI IF k[j*4+2] dst[i+191:i+128] := v[i+63:i] ELSE dst[i+191:i+128] := src[i+191:i+128] FI IF k[j*4+3] dst[i+255:i+192] := v[i+255:i+192] ELSE dst[i+255:i+192] := src[i+255:i+192] FI ENDFOR ESAC dst[MAX:512] := 0
...
__m512d _mm512_swizzle_pd (__m512d v, _MM_SWIZZLE_ENUM s)

Synopsis

__m512d _mm512_swizzle_pd (__m512d v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Performs a swizzle transformation of each of the two groups of packed 4x double-precision (64-bit) floating-point elements in v using swizzle parameter s, storing the results in dst.

Operation

CASE s OF _MM_SWIZ_REG_NONE: dst[511:0] := v[511:0] _MM_SWIZ_REG_DCBA: dst[511:0] := v[511:0] _MM_SWIZ_REG_CDAB: FOR j := 0 to 3 i := j*128 dst[i+63:i] := v[i+127:i+64] dst[i+127:i+64] := v[i+63:i] ENDFOR _MM_SWIZ_REG_BADC: FOR j := 0 to 1 i := j*256 dst[i+63:i] := v[i+191:i+128] dst[i+127:i+64] := v[i+255:i+192] dst[i+191:i+128] := v[i+63:i] dst[i+255:i+192] := v[i+127:i+64] ENDFOR _MM_SWIZ_REG_AAAA: FOR j := 0 to 1 i := j*256 dst[i+63:i] := v[i+63:i] dst[i+127:i+64] := v[i+63:i] dst[i+191:i+128] := v[i+63:i] dst[i+255:i+192] := v[i+63:i] ENDFOR _MM_SWIZ_REG_BBBB: FOR j := 0 to 1 i := j*256 dst[i+63:i] := v[i+127:i+64] dst[i+127:i+64] := v[i+127:i+64] dst[i+191:i+128] := v[i+127:i+64] dst[i+255:i+192] := v[i+127:i+64] ENDFOR _MM_SWIZ_REG_CCCC: FOR j := 0 to 1 i := j*256 dst[i+63:i] := v[i+191:i+128] dst[i+127:i+64] := v[i+191:i+128] dst[i+191:i+128] := v[i+191:i+128] dst[i+255:i+192] := v[i+191:i+128] ENDFOR _MM_SWIZ_REG_DDDD: FOR j := 0 to 1 i := j*256 dst[i+63:i] := v[i+255:i+192] dst[i+127:i+64] := v[i+255:i+192] dst[i+191:i+128] := v[i+255:i+192] dst[i+255:i+192] := v[i+255:i+192] ENDFOR _MM_SWIZ_REG_DACB: FOR j := 0 to 1 i := j*256 dst[i+63:i] := v[i+127:i+64] dst[i+127:i+64] := v[i+191:i+128] dst[i+191:i+128] := v[i+63:i] dst[i+255:i+192] := v[i+255:i+192] ENDFOR ESAC dst[MAX:512] := 0
...
__m512 _mm512_mask_swizzle_ps (__m512 src, __mmask16 k, __m512 v, _MM_SWIZZLE_ENUM s)

Synopsis

__m512 _mm512_mask_swizzle_ps (__m512 src, __mmask16 k, __m512 v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Performs a swizzle transformation of each of the four groups of packed 4x single-precision (32-bit) floating-point elements in v using swizzle parameter s, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

CASE s OF _MM_SWIZ_REG_NONE: dst[511:0] := v[511:0] _MM_SWIZ_REG_DCBA: dst[511:0] := v[511:0] _MM_SWIZ_REG_CDAB: FOR j := 0 to 7 i := j*64 IF k[j*2] dst[i+31:i] := v[i+63:i+32] ELSE dst[i+31:i] := src[i+31:i] FI IF k[j*2+1] dst[i+63:i+32] := v[i+31:i] ELSE dst[i+63:i+32] := src[i+63:i+32] FI ENDFOR _MM_SWIZ_REG_BADC: FOR j := 0 to 3 i := j*128 IF k[j*4] dst[i+31:i] := v[i+95:i+64] ELSE dst[i+31:i] := src[i+31:i] FI IF k[j*4+1] dst[i+63:i+32] := v[i+127:i+96] ELSE dst[i+63:i+32] := src[i+63:i+32] FI IF k[j*4+2] dst[i+95:i+64] := v[i+31:i] ELSE dst[i+95:i+64] := src[i+95:i+64] FI IF k[j*4+3] dst[i+127:i+96] := v[i+63:i+32] ELSE dst[i+127:i+96] := src[i+127:i+96] FI ENDFOR _MM_SWIZ_REG_AAAA: FOR j := 0 to 3 i := j*128 IF k[j*4] dst[i+31:i] := v[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI IF k[j*4+1] dst[i+63:i+32] := v[i+31:i] ELSE dst[i+63:i+32] := src[i+63:i+32] FI IF k[j*4+2] dst[i+95:i+64] := v[i+31:i] ELSE dst[i+95:i+64] := src[i+95:i+64] FI IF k[j*4+3] dst[i+127:i+96] := v[i+31:i] ELSE dst[i+127:i+96] := src[i+127:i+96] FI ENDFOR _MM_SWIZ_REG_BBBB: FOR j := 0 to 3 i := j*128 IF k[j*4] dst[i+31:i] := v[i+63:i+32] ELSE dst[i+31:i] := src[i+31:i] FI IF k[j*4+1] dst[i+63:i+32] := v[i+63:i+32] ELSE dst[i+63:i+32] := src[i+63:i+32] FI IF k[j*4+2] dst[i+95:i+64] := v[i+63:i+32] ELSE dst[i+95:i+64] := src[i+95:i+64] FI IF k[j*4+3] dst[i+127:i+96] := v[i+63:i+32] ELSE dst[i+127:i+96] := src[i+127:i+96] FI ENDFOR _MM_SWIZ_REG_CCCC: FOR j := 0 to 3 i := j*128 IF k[j*4] dst[i+31:i] := v[i+95:i+64] ELSE dst[i+31:i] := src[i+31:i] FI IF k[j*4+1] dst[i+63:i+32] := v[i+95:i+64] ELSE dst[i+63:i+32] := src[i+63:i+32] FI IF k[j*4+2] dst[i+95:i+64] := v[i+95:i+64] ELSE dst[i+95:i+64] := src[i+95:i+64] FI IF k[j*4+3] dst[i+127:i+96] := v[i+95:i+64] ELSE dst[i+127:i+96] := src[i+127:i+96] FI ENDFOR _MM_SWIZ_REG_DDDD: FOR j := 0 to 3 i := j*128 IF k[j*4] dst[i+31:i] := v[i+127:i+96] ELSE dst[i+31:i] := src[i+31:i] FI IF k[j*4+1] dst[i+63:i+32] := v[i+127:i+96] ELSE dst[i+63:i+32] := 
src[i+63:i+32] FI IF k[j*4+2] dst[i+95:i+64] := v[i+127:i+96] ELSE dst[i+95:i+64] := src[i+95:i+64] FI IF k[j*4+3] dst[i+127:i+96] := v[i+127:i+96] ELSE dst[i+127:i+96] := src[i+127:i+96] FI ENDFOR _MM_SWIZ_REG_DACB: FOR j := 0 to 3 i := j*128 IF k[j*4] dst[i+31:i] := v[i+63:i+32] ELSE dst[i+31:i] := src[i+31:i] FI IF k[j*4+1] dst[i+63:i+32] := v[i+95:i+64] ELSE dst[i+63:i+32] := src[i+63:i+32] FI IF k[j*4+2] dst[i+95:i+64] := v[i+31:i] ELSE dst[i+95:i+64] := src[i+95:i+64] FI IF k[j*4+3] dst[i+127:i+96] := v[i+127:i+96] ELSE dst[i+127:i+96] := src[i+127:i+96] FI ENDFOR ESAC dst[MAX:512] := 0
...
__m512 _mm512_swizzle_ps (__m512 v, _MM_SWIZZLE_ENUM s)

Synopsis

__m512 _mm512_swizzle_ps (__m512 v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI

Description

Performs a swizzle transformation of each of the four groups of packed 4x single-precision (32-bit) floating-point elements in v using swizzle parameter s, storing the results in dst.

Operation

CASE s OF _MM_SWIZ_REG_NONE: dst[511:0] := v[511:0] _MM_SWIZ_REG_DCBA: dst[511:0] := v[511:0] _MM_SWIZ_REG_CDAB: FOR j := 0 to 7 i := j*64 dst[i+31:i] := v[i+63:i+32] dst[i+63:i+32] := v[i+31:i] ENDFOR _MM_SWIZ_REG_BADC: FOR j := 0 to 3 i := j*128 dst[i+31:i] := v[i+95:i+64] dst[i+63:i+32] := v[i+127:i+96] dst[i+95:i+64] := v[i+31:i] dst[i+127:i+96] := v[i+63:i+32] ENDFOR _MM_SWIZ_REG_AAAA: FOR j := 0 to 3 i := j*128 dst[i+31:i] := v[i+31:i] dst[i+63:i+32] := v[i+31:i] dst[i+95:i+64] := v[i+31:i] dst[i+127:i+96] := v[i+31:i] ENDFOR _MM_SWIZ_REG_BBBB: FOR j := 0 to 3 i := j*128 dst[i+31:i] := v[i+63:i+32] dst[i+63:i+32] := v[i+63:i+32] dst[i+95:i+64] := v[i+63:i+32] dst[i+127:i+96] := v[i+63:i+32] ENDFOR _MM_SWIZ_REG_CCCC: FOR j := 0 to 3 i := j*128 dst[i+31:i] := v[i+95:i+64] dst[i+63:i+32] := v[i+95:i+64] dst[i+95:i+64] := v[i+95:i+64] dst[i+127:i+96] := v[i+95:i+64] ENDFOR _MM_SWIZ_REG_DDDD: FOR j := 0 to 3 i := j*128 dst[i+31:i] := v[i+127:i+96] dst[i+63:i+32] := v[i+127:i+96] dst[i+95:i+64] := v[i+127:i+96] dst[i+127:i+96] := v[i+127:i+96] ENDFOR _MM_SWIZ_REG_DACB: FOR j := 0 to 3 i := j*128 dst[i+31:i] := v[i+63:i+32] dst[i+63:i+32] := v[i+95:i+64] dst[i+95:i+64] := v[i+31:i] dst[i+127:i+96] := v[i+127:i+96] ENDFOR ESAC dst[MAX:512] := 0
...
__m128d _mm_tan_pd (__m128d a)

Synopsis

__m128d _mm_tan_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := TAN(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_tan_pd (__m256d a)

Synopsis

__m256d _mm256_tan_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := TAN(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_mask_tan_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_tan_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := TAN(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_tan_pd (__m512d a)

Synopsis

__m512d _mm512_tan_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := TAN(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m128 _mm_tan_ps (__m128 a)

Synopsis

__m128 _mm_tan_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := TAN(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_tan_ps (__m256 a)

Synopsis

__m256 _mm256_tan_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := TAN(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_mask_tan_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_tan_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := TAN(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_tan_ps (__m512 a)

Synopsis

__m512 _mm512_tan_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := TAN(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m128d _mm_tand_pd (__m128d a)

Synopsis

__m128d _mm_tand_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := TAND(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_tand_pd (__m256d a)

Synopsis

__m256d _mm256_tand_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := TAND(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_mask_tand_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_tand_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := TAND(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_tand_pd (__m512d a)

Synopsis

__m512d _mm512_tand_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := TAND(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m128 _mm_tand_ps (__m128 a)

Synopsis

__m128 _mm_tand_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := TAND(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_tand_ps (__m256 a)

Synopsis

__m256 _mm256_tand_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := TAND(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_mask_tand_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_tand_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := TAND(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_tand_ps (__m512 a)

Synopsis

__m512 _mm512_tand_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := TAND(a[i+31:i]) ENDFOR dst[MAX:512] := 0
...
__m128d _mm_tanh_pd (__m128d a)

Synopsis

__m128d _mm_tanh_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := TANH(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_tanh_pd (__m256d a)

Synopsis

__m256d _mm256_tanh_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := TANH(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_mask_tanh_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_tanh_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := TANH(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_tanh_pd (__m512d a)

Synopsis

__m512d _mm512_tanh_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := TANH(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m128 _mm_tanh_ps (__m128 a)

Synopsis

__m128 _mm_tanh_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := TANH(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_tanh_ps (__m256 a)

Synopsis

__m256 _mm256_tanh_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := TANH(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_mask_tanh_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_tanh_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := TANH(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_tanh_ps (__m512 a)

Synopsis

__m512 _mm512_tanh_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := TANH(a[i+31:i]) ENDFOR dst[MAX:512] := 0
vpternlogd
__m128i _mm_mask_ternarylogic_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b, int imm8)

Synopsis

__m128i _mm_mask_ternarylogic_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vpternlogd
CPUID Flags: AVX512VL + AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] FOR h := 0 to 31 index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpternlogd
__m128i _mm_maskz_ternarylogic_epi32 (__mmask8 k, __m128i a, __m128i b, __m128i c, int imm8)

Synopsis

__m128i _mm_maskz_ternarylogic_epi32 (__mmask8 k, __m128i a, __m128i b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogd
CPUID Flags: AVX512VL + AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] FOR h := 0 to 31 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpternlogd
__m128i _mm_ternarylogic_epi32 (__m128i a, __m128i b, __m128i c, int imm8)

Synopsis

__m128i _mm_ternarylogic_epi32 (__m128i a, __m128i b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogd
CPUID Flags: AVX512VL + AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.

Operation

FOR j := 0 to 3 i := j*32 FOR h := 0 to 31 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ENDFOR dst[MAX:128] := 0
vpternlogd
__m256i _mm256_mask_ternarylogic_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b, int imm8)

Synopsis

__m256i _mm256_mask_ternarylogic_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vpternlogd
CPUID Flags: AVX512VL + AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] FOR h := 0 to 31 index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpternlogd
__m256i _mm256_maskz_ternarylogic_epi32 (__mmask8 k, __m256i a, __m256i b, __m256i c, int imm8)

Synopsis

__m256i _mm256_maskz_ternarylogic_epi32 (__mmask8 k, __m256i a, __m256i b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogd
CPUID Flags: AVX512VL + AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] FOR h := 0 to 31 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpternlogd
__m256i _mm256_ternarylogic_epi32 (__m256i a, __m256i b, __m256i c, int imm8)

Synopsis

__m256i _mm256_ternarylogic_epi32 (__m256i a, __m256i b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogd
CPUID Flags: AVX512VL + AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.

Operation

FOR j := 0 to 7 i := j*32 FOR h := 0 to 31 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ENDFOR dst[MAX:256] := 0
vpternlogd
__m512i _mm512_mask_ternarylogic_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b, int imm8)

Synopsis

__m512i _mm512_mask_ternarylogic_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b, int imm8)
#include "immintrin.h"
Instruction: vpternlogd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] FOR h := 0 to 31 index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpternlogd
__m512i _mm512_maskz_ternarylogic_epi32 (__mmask16 k, __m512i a, __m512i b, __m512i c, int imm8)

Synopsis

__m512i _mm512_maskz_ternarylogic_epi32 (__mmask16 k, __m512i a, __m512i b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] FOR h := 0 to 31 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpternlogd
__m512i _mm512_ternarylogic_epi32 (__m512i a, __m512i b, __m512i c, int imm8)

Synopsis

__m512i _mm512_ternarylogic_epi32 (__m512i a, __m512i b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.

Operation

FOR j := 0 to 15 i := j*32 FOR h := 0 to 31 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ENDFOR dst[MAX:512] := 0
vpternlogq
__m128i _mm_mask_ternarylogic_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b, int imm8)

Synopsis

__m128i _mm_mask_ternarylogic_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vpternlogq
CPUID Flags: AVX512VL + AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] FOR h := 0 to 63 index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpternlogq
__m128i _mm_maskz_ternarylogic_epi64 (__mmask8 k, __m128i a, __m128i b, __m128i c, int imm8)

Synopsis

__m128i _mm_maskz_ternarylogic_epi64 (__mmask8 k, __m128i a, __m128i b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogq
CPUID Flags: AVX512VL + AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] FOR h := 0 to 63 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpternlogq
__m128i _mm_ternarylogic_epi64 (__m128i a, __m128i b, __m128i c, int imm8)

Synopsis

__m128i _mm_ternarylogic_epi64 (__m128i a, __m128i b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogq
CPUID Flags: AVX512VL + AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.

Operation

FOR j := 0 to 1 i := j*64 FOR h := 0 to 63 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ENDFOR dst[MAX:128] := 0
vpternlogq
__m256i _mm256_mask_ternarylogic_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b, int imm8)

Synopsis

__m256i _mm256_mask_ternarylogic_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vpternlogq
CPUID Flags: AVX512VL + AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] FOR h := 0 to 63 index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpternlogq
__m256i _mm256_maskz_ternarylogic_epi64 (__mmask8 k, __m256i a, __m256i b, __m256i c, int imm8)

Synopsis

__m256i _mm256_maskz_ternarylogic_epi64 (__mmask8 k, __m256i a, __m256i b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogq
CPUID Flags: AVX512VL + AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] FOR h := 0 to 63 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpternlogq
__m256i _mm256_ternarylogic_epi64 (__m256i a, __m256i b, __m256i c, int imm8)

Synopsis

__m256i _mm256_ternarylogic_epi64 (__m256i a, __m256i b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogq
CPUID Flags: AVX512VL + AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.

Operation

FOR j := 0 to 3 i := j*64 FOR h := 0 to 63 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ENDFOR dst[MAX:256] := 0
vpternlogq
__m512i _mm512_mask_ternarylogic_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b, int imm8)

Synopsis

__m512i _mm512_mask_ternarylogic_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b, int imm8)
#include "immintrin.h"
Instruction: vpternlogq zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] FOR h := 0 to 63 index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpternlogq
__m512i _mm512_maskz_ternarylogic_epi64 (__mmask8 k, __m512i a, __m512i b, __m512i c, int imm8)

Synopsis

__m512i _mm512_maskz_ternarylogic_epi64 (__mmask8 k, __m512i a, __m512i b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogq zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] FOR h := 0 to 63 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpternlogq
__m512i _mm512_ternarylogic_epi64 (__m512i a, __m512i b, __m512i c, int imm8)

Synopsis

__m512i _mm512_ternarylogic_epi64 (__m512i a, __m512i b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogq zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F

Description

Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.

Operation

FOR j := 0 to 7 i := j*64 FOR h := 0 to 63 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ENDFOR dst[MAX:512] := 0
...
int _mm_test_all_ones (__m128i a)

Synopsis

int _mm_test_all_ones (__m128i a)
#include "smmintrin.h"
Instruction: pcmpeqd xmm, xmm
             ptest xmm, xmm
CPUID Flags: SSE4.1

Description

Compute the bitwise AND NOT of a and a 128-bit vector with all bits set (i.e. test whether every bit of a is one), and return 1 if the result is zero, otherwise return 0.

Operation

IF (a[127:0] AND NOT 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF == 0) CF := 1 ELSE CF := 0 FI RETURN CF
ptest
int _mm_test_all_zeros (__m128i a, __m128i mask)

Synopsis

int _mm_test_all_zeros (__m128i a, __m128i mask)
#include "smmintrin.h"
Instruction: ptest xmm, xmm
CPUID Flags: SSE4.1

Description

Compute the bitwise AND of 128 bits (representing integer data) in a and mask, and return 1 if the result is zero, otherwise return 0.

Operation

IF (a[127:0] AND mask[127:0] == 0) ZF := 1 ELSE ZF := 0 FI RETURN ZF

Performance

Architecture | Latency | Throughput
Haswell | 2 | 1
Ivy Bridge | 2 | 1
Sandy Bridge | 2 | 1
Westmere | 2 | 1
Nehalem | 2 | 1
vptestmw
__mmask8 _mm_mask_test_epi16_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_test_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.

Operation

FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vptestmw
__mmask8 _mm_test_epi16_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_test_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.

Operation

FOR j := 0 to 7 i := j*16 k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ENDFOR k[MAX:8] := 0
vptestmw
__mmask16 _mm256_mask_test_epi16_mask (__mmask16 k1, __m256i a, __m256i b)

Synopsis

__mmask16 _mm256_mask_test_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.

Operation

FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vptestmw
__mmask16 _mm256_test_epi16_mask (__m256i a, __m256i b)

Synopsis

__mmask16 _mm256_test_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.

Operation

FOR j := 0 to 15 i := j*16 k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ENDFOR k[MAX:16] := 0
vptestmw
__mmask32 _mm512_mask_test_epi16_mask (__mmask32 k1, __m512i a, __m512i b)

Synopsis

__mmask32 _mm512_mask_test_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmw
CPUID Flags: AVX512BW

Description

Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.

Operation

FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vptestmw
__mmask32 _mm512_test_epi16_mask (__m512i a, __m512i b)

Synopsis

__mmask32 _mm512_test_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmw
CPUID Flags: AVX512BW

Description

Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.

Operation

FOR j := 0 to 31 i := j*16 k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ENDFOR k[MAX:32] := 0
vptestmd
__mmask8 _mm_mask_test_epi32_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_test_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.

Operation

FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vptestmd
__mmask8 _mm_test_epi32_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_test_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.

Operation

FOR j := 0 to 3 i := j*32 k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ENDFOR k[MAX:4] := 0
vptestmd
__mmask8 _mm256_mask_test_epi32_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_test_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.

Operation

FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vptestmd
__mmask8 _mm256_test_epi32_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_test_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.

Operation

FOR j := 0 to 7 i := j*32 k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ENDFOR k[MAX:8] := 0
vptestmd
__mmask16 _mm512_mask_test_epi32_mask (__mmask16 k1, __m512i a, __m512i b)

Synopsis

__mmask16 _mm512_mask_test_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmd k {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vptestmd
__mmask16 _mm512_test_epi32_mask (__m512i a, __m512i b)

Synopsis

__mmask16 _mm512_test_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmd k {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.

Operation

FOR j := 0 to 15 i := j*32 k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ENDFOR k[MAX:16] := 0
vptestmq
__mmask8 _mm_mask_test_epi64_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_test_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.

Operation

FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vptestmq
__mmask8 _mm_test_epi64_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_test_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.

Operation

FOR j := 0 to 1 i := j*64 k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ENDFOR k[MAX:2] := 0
vptestmq
__mmask8 _mm256_mask_test_epi64_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_test_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.

Operation

FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vptestmq
__mmask8 _mm256_test_epi64_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_test_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.

Operation

FOR j := 0 to 3 i := j*64 k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ENDFOR k[MAX:4] := 0
vptestmq
__mmask8 _mm512_mask_test_epi64_mask (__mmask8 k1, __m512i a, __m512i b)

Synopsis

__mmask8 _mm512_mask_test_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmq k {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vptestmq
__mmask8 _mm512_test_epi64_mask (__m512i a, __m512i b)

Synopsis

__mmask8 _mm512_test_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmq k {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.

Operation

FOR j := 0 to 7 i := j*64 k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ENDFOR k[MAX:8] := 0
vptestmb
__mmask16 _mm_mask_test_epi8_mask (__mmask16 k1, __m128i a, __m128i b)

Synopsis

__mmask16 _mm_mask_test_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.

Operation

FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vptestmb
__mmask16 _mm_test_epi8_mask (__m128i a, __m128i b)

Synopsis

__mmask16 _mm_test_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.

Operation

FOR j := 0 to 15 i := j*8 k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ENDFOR k[MAX:16] := 0
vptestmb
__mmask32 _mm256_mask_test_epi8_mask (__mmask32 k1, __m256i a, __m256i b)

Synopsis

__mmask32 _mm256_mask_test_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.

Operation

FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vptestmb
__mmask32 _mm256_test_epi8_mask (__m256i a, __m256i b)

Synopsis

__mmask32 _mm256_test_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.

Operation

FOR j := 0 to 31 i := j*8 k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ENDFOR k[MAX:32] := 0
vptestmb
__mmask64 _mm512_mask_test_epi8_mask (__mmask64 k1, __m512i a, __m512i b)

Synopsis

__mmask64 _mm512_mask_test_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmb
CPUID Flags: AVX512BW

Description

Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.

Operation

FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
vptestmb
__mmask64 _mm512_test_epi8_mask (__m512i a, __m512i b)

Synopsis

__mmask64 _mm512_test_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmb
CPUID Flags: AVX512BW

Description

Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.

Operation

FOR j := 0 to 63 i := j*8 k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ENDFOR k[MAX:64] := 0
ptest
int _mm_test_mix_ones_zeros (__m128i a, __m128i mask)

Synopsis

int _mm_test_mix_ones_zeros (__m128i a, __m128i mask)
#include "smmintrin.h"
Instruction: ptest xmm, xmm
CPUID Flags: SSE4.1

Description

Compute the bitwise AND of 128 bits (representing integer data) in a and mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and mask, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.

Operation

IF (a[127:0] AND mask[127:0] == 0) ZF := 1 ELSE ZF := 0 FI IF (a[127:0] AND NOT mask[127:0] == 0) CF := 1 ELSE CF := 0 FI IF (ZF == 0 && CF == 0) RETURN 1 ELSE RETURN 0 FI

Performance

Architecture | Latency | Throughput
Haswell | 2 | 1
Ivy Bridge | 2 | 1
Sandy Bridge | 2 | 1
Westmere | 2 | 1
Nehalem | 2 | 1
vtestpd
int _mm_testc_pd (__m128d a, __m128d b)

Synopsis

int _mm_testc_pd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vtestpd xmm, xmm
CPUID Flags: AVX

Description

Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in a and b, producing an intermediate 128-bit value, and set ZF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set CF to 0. Return the CF value.

Operation

tmp[127:0] := a[127:0] AND b[127:0] IF (tmp[63] == tmp[127] == 0) ZF := 1 ELSE ZF := 0 FI tmp[127:0] := a[127:0] AND NOT b[127:0] IF (tmp[63] == tmp[127] == 0) CF := 1 ELSE CF := 0 FI RETURN CF

Performance

Architecture | Latency | Throughput
Haswell | 3 | -
Ivy Bridge | 1 | -
Sandy Bridge | 1 | -
vtestpd
int _mm256_testc_pd (__m256d a, __m256d b)

Synopsis

int _mm256_testc_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vtestpd ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in a and b, producing an intermediate 256-bit value, and set ZF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set CF to 0. Return the CF value.

Operation

tmp[255:0] := a[255:0] AND b[255:0] IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0) ZF := 1 ELSE ZF := 0 FI tmp[255:0] := a[255:0] AND NOT b[255:0] IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0) CF := 1 ELSE CF := 0 FI RETURN CF

Performance

Architecture | Latency | Throughput
Haswell | 3 | 1
Ivy Bridge | 1 | 1
Sandy Bridge | 1 | 1
vtestps
int _mm_testc_ps (__m128 a, __m128 b)

Synopsis

int _mm_testc_ps (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vtestps xmm, xmm
CPUID Flags: AVX

Description

Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in a and b, producing an intermediate 128-bit value, and set ZF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set CF to 0. Return the CF value.

Operation

tmp[127:0] := a[127:0] AND b[127:0] IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0) ZF := 1 ELSE ZF := 0 FI tmp[127:0] := a[127:0] AND NOT b[127:0] IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0) CF := 1 ELSE CF := 0 FI RETURN CF

Performance

Architecture | Latency | Throughput
Haswell | 3 | -
Ivy Bridge | 1 | -
Sandy Bridge | 1 | -
vtestps
int _mm256_testc_ps (__m256 a, __m256 b)

Synopsis

int _mm256_testc_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vtestps ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in a and b, producing an intermediate 256-bit value, and set ZF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set CF to 0. Return the CF value.

Operation

tmp[255:0] := a[255:0] AND b[255:0] IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0) ZF := 1 ELSE ZF := 0 FI tmp[255:0] := a[255:0] AND NOT b[255:0] IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0) CF := 1 ELSE CF := 0 FI RETURN CF

Performance

Architecture | Latency | Throughput
Haswell | 3 | 1
Ivy Bridge | 1 | 1
Sandy Bridge | 1 | 1
ptest
int _mm_testc_si128 (__m128i a, __m128i b)

Synopsis

int _mm_testc_si128 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: ptest xmm, xmm
CPUID Flags: SSE4.1

Description

Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the CF value.

Operation

IF (a[127:0] AND b[127:0] == 0) ZF := 1 ELSE ZF := 0 FI IF (a[127:0] AND NOT b[127:0] == 0) CF := 1 ELSE CF := 0 FI RETURN CF

Performance

Architecture | Latency | Throughput
Haswell | 2 | 1
Ivy Bridge | 2 | 1
Sandy Bridge | 2 | 1
Westmere | 2 | 1
Nehalem | 2 | 1
vptest
int _mm256_testc_si256 (__m256i a, __m256i b)

Synopsis

int _mm256_testc_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptest ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise AND of 256 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the CF value.

Operation

IF (a[255:0] AND b[255:0] == 0) ZF := 1 ELSE ZF := 0 FI IF (a[255:0] AND NOT b[255:0] == 0) CF := 1 ELSE CF := 0 FI RETURN CF

Performance

Architecture | Latency | Throughput
Haswell | 4 | -
Ivy Bridge | 2 | -
Sandy Bridge | 2 | -
vptestnmw
__mmask8 _mm_mask_testn_epi16_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_testn_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.

Operation

FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vptestnmw
__mmask8 _mm_testn_epi16_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_testn_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.

Operation

FOR j := 0 to 7 i := j*16 k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ENDFOR k[MAX:8] := 0
vptestnmw
__mmask16 _mm256_mask_testn_epi16_mask (__mmask16 k1, __m256i a, __m256i b)

Synopsis

__mmask16 _mm256_mask_testn_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.

Operation

FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vptestnmw
__mmask16 _mm256_testn_epi16_mask (__m256i a, __m256i b)

Synopsis

__mmask16 _mm256_testn_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.

Operation

FOR j := 0 to 15 i := j*16 k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ENDFOR k[MAX:16] := 0
vptestnmw
__mmask32 _mm512_mask_testn_epi16_mask (__mmask32 k1, __m512i a, __m512i b)

Synopsis

__mmask32 _mm512_mask_testn_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmw
CPUID Flags: AVX512BW

Description

Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.

Operation

FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vptestnmw
__mmask32 _mm512_testn_epi16_mask (__m512i a, __m512i b)

Synopsis

__mmask32 _mm512_testn_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmw
CPUID Flags: AVX512BW

Description

Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.

Operation

FOR j := 0 to 31 i := j*16 k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ENDFOR k[MAX:32] := 0
vptestnmd
__mmask8 _mm_mask_testn_epi32_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_testn_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.

Operation

FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vptestnmd
__mmask8 _mm_testn_epi32_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_testn_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.

Operation

FOR j := 0 to 3 i := j*32 k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ENDFOR k[MAX:4] := 0
vptestnmd
__mmask8 _mm256_mask_testn_epi32_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_testn_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.

Operation

FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vptestnmd
__mmask8 _mm256_testn_epi32_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_testn_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.

Operation

FOR j := 0 to 7 i := j*32 k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ENDFOR k[MAX:8] := 0
vptestnmd
__mmask16 _mm512_mask_testn_epi32_mask (__mmask16 k1, __m512i a, __m512i b)

Synopsis

__mmask16 _mm512_mask_testn_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmd k {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.

Operation

FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vptestnmd
__mmask16 _mm512_testn_epi32_mask (__m512i a, __m512i b)

Synopsis

__mmask16 _mm512_testn_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmd k {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.

Operation

FOR j := 0 to 15 i := j*32 k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ENDFOR k[MAX:16] := 0
vptestnmq
__mmask8 _mm_mask_testn_epi64_mask (__mmask8 k1, __m128i a, __m128i b)

Synopsis

__mmask8 _mm_mask_testn_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.

Operation

FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
vptestnmq
__mmask8 _mm_testn_epi64_mask (__m128i a, __m128i b)

Synopsis

__mmask8 _mm_testn_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.

Operation

FOR j := 0 to 1 i := j*64 k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ENDFOR k[MAX:2] := 0
vptestnmq
__mmask8 _mm256_mask_testn_epi64_mask (__mmask8 k1, __m256i a, __m256i b)

Synopsis

__mmask8 _mm256_mask_testn_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.

Operation

FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
vptestnmq
__mmask8 _mm256_testn_epi64_mask (__m256i a, __m256i b)

Synopsis

__mmask8 _mm256_testn_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.

Operation

FOR j := 0 to 3 i := j*64 k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ENDFOR k[MAX:4] := 0
vptestnmq
__mmask8 _mm512_mask_testn_epi64_mask (__mmask8 k1, __m512i a, __m512i b)

Synopsis

__mmask8 _mm512_mask_testn_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmq k {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.

Operation

FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
vptestnmq
__mmask8 _mm512_testn_epi64_mask (__m512i a, __m512i b)

Synopsis

__mmask8 _mm512_testn_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmq k {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.

Operation

FOR j := 0 to 7 i := j*64 k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ENDFOR k[MAX:8] := 0
vptestnmb
__mmask16 _mm_mask_testn_epi8_mask (__mmask16 k1, __m128i a, __m128i b)

Synopsis

__mmask16 _mm_mask_testn_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.

Operation

FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
vptestnmb
__mmask16 _mm_testn_epi8_mask (__m128i a, __m128i b)

Synopsis

__mmask16 _mm_testn_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.

Operation

FOR j := 0 to 15 i := j*8 k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ENDFOR k[MAX:16] := 0
vptestnmb
__mmask32 _mm256_mask_testn_epi8_mask (__mmask32 k1, __m256i a, __m256i b)

Synopsis

__mmask32 _mm256_mask_testn_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.

Operation

FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
vptestnmb
__mmask32 _mm256_testn_epi8_mask (__m256i a, __m256i b)

Synopsis

__mmask32 _mm256_testn_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.

Operation

FOR j := 0 to 31 i := j*8 k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ENDFOR k[MAX:32] := 0
vptestnmb
__mmask64 _mm512_mask_testn_epi8_mask (__mmask64 k1, __m512i a, __m512i b)

Synopsis

__mmask64 _mm512_mask_testn_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmb
CPUID Flags: AVX512BW

Description

Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.

Operation

FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
vptestnmb
__mmask64 _mm512_testn_epi8_mask (__m512i a, __m512i b)

Synopsis

__mmask64 _mm512_testn_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmb
CPUID Flags: AVX512BW

Description

Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.

Operation

FOR j := 0 to 63 i := j*8 k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ENDFOR k[MAX:64] := 0
vtestpd
int _mm_testnzc_pd (__m128d a, __m128d b)

Synopsis

int _mm_testnzc_pd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vtestpd xmm, xmm
CPUID Flags: AVX

Description

Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in a and b, producing an intermediate 128-bit value, and set ZF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.

Operation

tmp[127:0] := a[127:0] AND b[127:0] IF (tmp[63] == tmp[127] == 0) ZF := 1 ELSE ZF := 0 FI tmp[127:0] := a[127:0] AND NOT b[127:0] IF (tmp[63] == tmp[127] == 0) CF := 1 ELSE CF := 0 FI IF (ZF == 0 && CF == 0) RETURN 1 ELSE RETURN 0 FI

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge1-
Sandy Bridge1-
vtestpd
int _mm256_testnzc_pd (__m256d a, __m256d b)

Synopsis

int _mm256_testnzc_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vtestpd ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in a and b, producing an intermediate 256-bit value, and set ZF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.

Operation

tmp[255:0] := a[255:0] AND b[255:0] IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0) ZF := 1 ELSE ZF := 0 FI tmp[255:0] := a[255:0] AND NOT b[255:0] IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0) CF := 1 ELSE CF := 0 FI IF (ZF == 0 && CF == 0) RETURN 1 ELSE RETURN 0 FI

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge11
Sandy Bridge11
vtestps
int _mm_testnzc_ps (__m128 a, __m128 b)

Synopsis

int _mm_testnzc_ps (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vtestps xmm, xmm
CPUID Flags: AVX

Description

Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in a and b, producing an intermediate 128-bit value, and set ZF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.

Operation

tmp[127:0] := a[127:0] AND b[127:0] IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0) ZF := 1 ELSE ZF := 0 FI tmp[127:0] := a[127:0] AND NOT b[127:0] IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0) CF := 1 ELSE CF := 0 FI IF (ZF == 0 && CF == 0) RETURN 1 ELSE RETURN 0 FI

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge1-
Sandy Bridge1-
vtestps
int _mm256_testnzc_ps (__m256 a, __m256 b)

Synopsis

int _mm256_testnzc_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vtestps ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in a and b, producing an intermediate 256-bit value, and set ZF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.

Operation

tmp[255:0] := a[255:0] AND b[255:0] IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0) ZF := 1 ELSE ZF := 0 FI tmp[255:0] := a[255:0] AND NOT b[255:0] IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0) CF := 1 ELSE CF := 0 FI IF (ZF == 0 && CF == 0) RETURN 1 ELSE RETURN 0 FI

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge11
Sandy Bridge11
ptest
int _mm_testnzc_si128 (__m128i a, __m128i b)

Synopsis

int _mm_testnzc_si128 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: ptest xmm, xmm
CPUID Flags: SSE4.1

Description

Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.

Operation

IF (a[127:0] AND b[127:0] == 0) ZF := 1 ELSE ZF := 0 FI IF (a[127:0] AND NOT b[127:0] == 0) CF := 1 ELSE CF := 0 FI IF (ZF == 0 && CF == 0) RETURN 1 ELSE RETURN 0 FI

Performance

ArchitectureLatencyThroughput
Haswell21
Ivy Bridge21
Sandy Bridge21
Westmere21
Nehalem21
vptest
int _mm256_testnzc_si256 (__m256i a, __m256i b)

Synopsis

int _mm256_testnzc_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptest ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise AND of 256 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.

Operation

IF (a[255:0] AND b[255:0] == 0) ZF := 1 ELSE ZF := 0 FI IF (a[255:0] AND NOT b[255:0] == 0) CF := 1 ELSE CF := 0 FI IF (ZF == 0 && CF == 0) RETURN 1 ELSE RETURN 0 FI

Performance

ArchitectureLatencyThroughput
Haswell4-
Ivy Bridge2-
Sandy Bridge2-
vtestpd
int _mm_testz_pd (__m128d a, __m128d b)

Synopsis

int _mm_testz_pd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vtestpd xmm, xmm
CPUID Flags: AVX

Description

Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in a and b, producing an intermediate 128-bit value, and set ZF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set CF to 0. Return the ZF value.

Operation

tmp[127:0] := a[127:0] AND b[127:0] IF (tmp[63] == tmp[127] == 0) ZF := 1 ELSE ZF := 0 FI tmp[127:0] := a[127:0] AND NOT b[127:0] IF (tmp[63] == tmp[127] == 0) CF := 1 ELSE CF := 0 FI RETURN ZF

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge1-
Sandy Bridge1-
vtestpd
int _mm256_testz_pd (__m256d a, __m256d b)

Synopsis

int _mm256_testz_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vtestpd ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in a and b, producing an intermediate 256-bit value, and set ZF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set CF to 0. Return the ZF value.

Operation

tmp[255:0] := a[255:0] AND b[255:0] IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0) ZF := 1 ELSE ZF := 0 FI tmp[255:0] := a[255:0] AND NOT b[255:0] IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0) CF := 1 ELSE CF := 0 FI RETURN ZF

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge11
Sandy Bridge11
vtestps
int _mm_testz_ps (__m128 a, __m128 b)

Synopsis

int _mm_testz_ps (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vtestps xmm, xmm
CPUID Flags: AVX

Description

Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in a and b, producing an intermediate 128-bit value, and set ZF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set CF to 0. Return the ZF value.

Operation

tmp[127:0] := a[127:0] AND b[127:0] IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0) ZF := 1 ELSE ZF := 0 FI tmp[127:0] := a[127:0] AND NOT b[127:0] IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0) CF := 1 ELSE CF := 0 FI RETURN ZF

Performance

ArchitectureLatencyThroughput
Haswell3-
Ivy Bridge1-
Sandy Bridge1-
vtestps
int _mm256_testz_ps (__m256 a, __m256 b)

Synopsis

int _mm256_testz_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vtestps ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in a and b, producing an intermediate 256-bit value, and set ZF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set CF to 0. Return the ZF value.

Operation

tmp[255:0] := a[255:0] AND b[255:0] IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0) ZF := 1 ELSE ZF := 0 FI tmp[255:0] := a[255:0] AND NOT b[255:0] IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0) CF := 1 ELSE CF := 0 FI RETURN ZF

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge11
Sandy Bridge11
ptest
int _mm_testz_si128 (__m128i a, __m128i b)

Synopsis

int _mm_testz_si128 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: ptest xmm, xmm
CPUID Flags: SSE4.1

Description

Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the ZF value.

Operation

IF (a[127:0] AND b[127:0] == 0) ZF := 1 ELSE ZF := 0 FI IF (a[127:0] AND NOT b[127:0] == 0) CF := 1 ELSE CF := 0 FI RETURN ZF

Performance

ArchitectureLatencyThroughput
Haswell21
Ivy Bridge21
Sandy Bridge21
Westmere21
Nehalem21
vptest
int _mm256_testz_si256 (__m256i a, __m256i b)

Synopsis

int _mm256_testz_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptest ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise AND of 256 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the ZF value.

Operation

IF (a[255:0] AND b[255:0] == 0) ZF := 1 ELSE ZF := 0 FI IF (a[255:0] AND NOT b[255:0] == 0) CF := 1 ELSE CF := 0 FI RETURN ZF

Performance

ArchitectureLatencyThroughput
Haswell4-
Ivy Bridge2-
Sandy Bridge2-
...
_MM_TRANSPOSE4_PS (__m128 row0, __m128 row1, __m128 row2, __m128 row3)

Synopsis

_MM_TRANSPOSE4_PS (__m128 row0, __m128 row1, __m128 row2, __m128 row3)
#include "xmmintrin.h"
CPUID Flags: SSE

Description

Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in row0, row1, row2, and row3, and store the transposed matrix in these vectors (row0 now contains column 0, etc.).

Operation

__m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps(row0, row1); tmp2 = _mm_unpacklo_ps(row2, row3); tmp1 = _mm_unpackhi_ps(row0, row1); tmp3 = _mm_unpackhi_ps(row2, row3); row0 = _mm_movelh_ps(tmp0, tmp2); row1 = _mm_movehl_ps(tmp2, tmp0); row2 = _mm_movelh_ps(tmp1, tmp3); row3 = _mm_movehl_ps(tmp3, tmp1);
...
__m128d _mm_trunc_pd (__m128d a)

Synopsis

__m128d _mm_trunc_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Truncate the packed double-precision (64-bit) floating-point elements in a, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := TRUNCATE(a[i+63:i]) ENDFOR dst[MAX:128] := 0
...
__m256d _mm256_trunc_pd (__m256d a)

Synopsis

__m256d _mm256_trunc_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Truncate the packed double-precision (64-bit) floating-point elements in a, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := TRUNCATE(a[i+63:i]) ENDFOR dst[MAX:256] := 0
...
__m512d _mm512_mask_trunc_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_trunc_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Truncate the packed double-precision (64-bit) floating-point elements in a, and store the results as packed double-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := TRUNCATE(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
...
__m512d _mm512_trunc_pd (__m512d a)

Synopsis

__m512d _mm512_trunc_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Truncate the packed double-precision (64-bit) floating-point elements in a, and store the results as packed double-precision floating-point elements in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := TRUNCATE(a[i+63:i]) ENDFOR dst[MAX:512] := 0
...
__m128 _mm_trunc_ps (__m128 a)

Synopsis

__m128 _mm_trunc_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE

Description

Truncate the packed single-precision (32-bit) floating-point elements in a, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := TRUNCATE(a[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256 _mm256_trunc_ps (__m256 a)

Synopsis

__m256 _mm256_trunc_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX

Description

Truncate the packed single-precision (32-bit) floating-point elements in a, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := TRUNCATE(a[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m512 _mm512_mask_trunc_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_trunc_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Truncate the packed single-precision (32-bit) floating-point elements in a, and store the results as packed single-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := TRUNCATE(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
...
__m512 _mm512_trunc_ps (__m512 a)

Synopsis

__m512 _mm512_trunc_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Truncate the packed single-precision (32-bit) floating-point elements in a, and store the results as packed single-precision floating-point elements in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := TRUNCATE(a[i+31:i]) ENDFOR dst[MAX:512] := 0
tzcnt
int _mm_tzcnt_32 (unsigned int a)

Synopsis

int _mm_tzcnt_32 (unsigned int a)
#include "immintrin.h"
Instruction: tzcnt r32, r32
CPUID Flags: BMI1

Description

Count the number of trailing zero bits in unsigned 32-bit integer a, and return that count in dst.

Operation

tmp := 0 dst := 0 DO WHILE ((tmp < 32) AND a[tmp] == 0) tmp := tmp + 1 dst := dst + 1 OD
tzcnt
__int64 _mm_tzcnt_64 (unsigned __int64 a)

Synopsis

__int64 _mm_tzcnt_64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: tzcnt r64, r64
CPUID Flags: BMI1

Description

Count the number of trailing zero bits in unsigned 64-bit integer a, and return that count in dst.

Operation

tmp := 0 dst := 0 DO WHILE ((tmp < 64) AND a[tmp] == 0) tmp := tmp + 1 dst := dst + 1 OD
tzcnt
unsigned int _tzcnt_u32 (unsigned int a)

Synopsis

unsigned int _tzcnt_u32 (unsigned int a)
#include "immintrin.h"
Instruction: tzcnt r32, r32
CPUID Flags: BMI1

Description

Count the number of trailing zero bits in unsigned 32-bit integer a, and return that count in dst.

Operation

tmp := 0 dst := 0 DO WHILE ((tmp < 32) AND a[tmp] == 0) tmp := tmp + 1 dst := dst + 1 OD

Performance

ArchitectureLatencyThroughput
Haswell3-
tzcnt
unsigned __int64 _tzcnt_u64 (unsigned __int64 a)

Synopsis

unsigned __int64 _tzcnt_u64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: tzcnt r64, r64
CPUID Flags: BMI1

Description

Count the number of trailing zero bits in unsigned 64-bit integer a, and return that count in dst.

Operation

tmp := 0 dst := 0 DO WHILE ((tmp < 64) AND a[tmp] == 0) tmp := tmp + 1 dst := dst + 1 OD

Performance

ArchitectureLatencyThroughput
Haswell3-
tzcnti
int _mm_tzcnti_32 (int a, unsigned int x)

Synopsis

int _mm_tzcnti_32 (int a, unsigned int x)
#include "immintrin.h"
Instruction: tzcnti r32, r32
CPUID Flags: KNCNI

Description

Counts the number of trailing bits in unsigned 32-bit integer x starting at bit a storing the result in dst.

Operation

count := 0 FOR j := a to 31 IF NOT(x[j] == 1) count := count + 1 FI ENDFOR dst := count
tzcnti
__int64 _mm_tzcnti_64 (__int64 a, unsigned __int64 x)

Synopsis

__int64 _mm_tzcnti_64 (__int64 a, unsigned __int64 x)
#include "immintrin.h"
Instruction: tzcnti r64, r64
CPUID Flags: KNCNI

Description

Counts the number of trailing bits in unsigned 64-bit integer x starting at bit a storing the result in dst.

Operation

count := 0 FOR j := a to 63 IF NOT(x[j] == 1) count := count + 1 FI ENDFOR dst := count
ucomisd
int _mm_ucomieq_sd (__m128d a, __m128d b)

Synopsis

int _mm_ucomieq_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: ucomisd xmm, xmm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point element in a and b for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

Operation

RETURN ( a[63:0] == b[63:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
ucomiss
int _mm_ucomieq_ss (__m128 a, __m128 b)

Synopsis

int _mm_ucomieq_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: ucomiss xmm, xmm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point element in a and b for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

Operation

RETURN ( a[31:0] == b[31:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
ucomisd
int _mm_ucomige_sd (__m128d a, __m128d b)

Synopsis

int _mm_ucomige_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: ucomisd xmm, xmm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point element in a and b for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

Operation

RETURN ( a[63:0] >= b[63:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
ucomiss
int _mm_ucomige_ss (__m128 a, __m128 b)

Synopsis

int _mm_ucomige_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: ucomiss xmm, xmm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point element in a and b for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

Operation

RETURN ( a[31:0] >= b[31:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
ucomisd
int _mm_ucomigt_sd (__m128d a, __m128d b)

Synopsis

int _mm_ucomigt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: ucomisd xmm, xmm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point element in a and b for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

Operation

RETURN ( a[63:0] > b[63:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
ucomiss
int _mm_ucomigt_ss (__m128 a, __m128 b)

Synopsis

int _mm_ucomigt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: ucomiss xmm, xmm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point element in a and b for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

Operation

RETURN ( a[31:0] > b[31:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
ucomisd
int _mm_ucomile_sd (__m128d a, __m128d b)

Synopsis

int _mm_ucomile_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: ucomisd xmm, xmm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point element in a and b for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

Operation

RETURN ( a[63:0] <= b[63:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
ucomiss
int _mm_ucomile_ss (__m128 a, __m128 b)

Synopsis

int _mm_ucomile_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: ucomiss xmm, xmm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point element in a and b for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

Operation

RETURN ( a[31:0] <= b[31:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
ucomisd
int _mm_ucomilt_sd (__m128d a, __m128d b)

Synopsis

int _mm_ucomilt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: ucomisd xmm, xmm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point element in a and b for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

Operation

RETURN ( a[63:0] < b[63:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
ucomiss
int _mm_ucomilt_ss (__m128 a, __m128 b)

Synopsis

int _mm_ucomilt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: ucomiss xmm, xmm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point element in a and b for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

Operation

RETURN ( a[31:0] < b[31:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
ucomisd
int _mm_ucomineq_sd (__m128d a, __m128d b)

Synopsis

int _mm_ucomineq_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: ucomisd xmm, xmm
CPUID Flags: SSE2

Description

Compare the lower double-precision (64-bit) floating-point element in a and b for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

Operation

RETURN ( a[63:0] != b[63:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
Westmere11
Nehalem11
ucomiss
int _mm_ucomineq_ss (__m128 a, __m128 b)

Synopsis

int _mm_ucomineq_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: ucomiss xmm, xmm
CPUID Flags: SSE

Description

Compare the lower single-precision (32-bit) floating-point element in a and b for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

Operation

RETURN ( a[31:0] != b[31:0] ) ? 1 : 0

Performance

ArchitectureLatencyThroughput
Haswell31
Ivy Bridge21
Sandy Bridge21
...
__m128i _mm_udiv_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_udiv_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed unsigned 32-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_udiv_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_udiv_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed unsigned 32-bit integers in a by packed elements in b, and store the truncated results in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:256] := 0
...
__m128i _mm_udivrem_epi32 (__m128i * mem_addr, __m128i a, __m128i b)

Synopsis

__m128i _mm_udivrem_epi32 (__m128i * mem_addr, __m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed unsigned 32-bit integers in a by packed elements in b, store the truncated results in dst, and store the remainders as packed unsigned 32-bit integers into memory at mem_addr.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_udivrem_epi32 (__m256i * mem_addr, __m256i a, __m256i b)

Synopsis

__m256i _mm256_udivrem_epi32 (__m256i * mem_addr, __m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed unsigned 32-bit integers in a by packed elements in b, store the truncated results in dst, and store the remainders as packed unsigned 32-bit integers into memory at mem_addr.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:256] := 0
__m512 _mm512_undefined (void)

Synopsis

__m512 _mm512_undefined (void)
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Return vector of type __m512 with undefined elements.
__m512i _mm512_undefined_epi32 ()

Synopsis

__m512i _mm512_undefined_epi32 ()
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Return vector of type __m512i with undefined elements.
__m128d _mm_undefined_pd (void)

Synopsis

__m128d _mm_undefined_pd (void)
#include "immintrin.h"
CPUID Flags: AVX

Description

Return vector of type __m128d with undefined elements.
__m256d _mm256_undefined_pd (void)

Synopsis

__m256d _mm256_undefined_pd (void)
#include "immintrin.h"
CPUID Flags: AVX

Description

Return vector of type __m256d with undefined elements.
__m512d _mm512_undefined_pd ()

Synopsis

__m512d _mm512_undefined_pd ()
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Return vector of type __m512d with undefined elements.
__m128 _mm_undefined_ps (void)

Synopsis

__m128 _mm_undefined_ps (void)
#include "immintrin.h"
CPUID Flags: AVX

Description

Return vector of type __m128 with undefined elements.
__m256 _mm256_undefined_ps (void)

Synopsis

__m256 _mm256_undefined_ps (void)
#include "immintrin.h"
CPUID Flags: AVX

Description

Return vector of type __m256 with undefined elements.
__m512 _mm512_undefined_ps ()

Synopsis

__m512 _mm512_undefined_ps ()
#include "immintrin.h"
CPUID Flags: AVX512F

Description

Return vector of type __m512 with undefined elements.
__m128i _mm_undefined_si128 (void)

Synopsis

__m128i _mm_undefined_si128 (void)
#include "immintrin.h"
CPUID Flags: AVX

Description

Return vector of type __m128i with undefined elements.
__m256i _mm256_undefined_si256 (void)

Synopsis

__m256i _mm256_undefined_si256 (void)
#include "immintrin.h"
CPUID Flags: AVX

Description

Return vector of type __m256i with undefined elements.
vpunpckhwd
__m128i _mm_mask_unpackhi_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_unpackhi_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhwd
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 16-bit integers from the high half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[79:64] dst[31:16] := src2[79:64] dst[47:32] := src1[95:80] dst[63:48] := src2[95:80] dst[79:64] := src1[111:96] dst[95:80] := src2[111:96] dst[111:96] := src1[127:112] dst[127:112] := src2[127:112] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpunpckhwd
__m128i _mm_maskz_unpackhi_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_unpackhi_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhwd
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 16-bit integers from the high half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[79:64] dst[31:16] := src2[79:64] dst[47:32] := src1[95:80] dst[63:48] := src2[95:80] dst[79:64] := src1[111:96] dst[95:80] := src2[111:96] dst[111:96] := src1[127:112] dst[127:112] := src2[127:112] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
punpckhwd
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpckhwd xmm, xmm
CPUID Flags: SSE2

Description

Unpack and interleave 16-bit integers from the high half of a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[79:64] dst[31:16] := src2[79:64] dst[47:32] := src1[95:80] dst[63:48] := src2[95:80] dst[79:64] := src1[111:96] dst[95:80] := src2[111:96] dst[111:96] := src1[127:112] dst[127:112] := src2[127:112] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpunpckhwd
__m256i _mm256_mask_unpackhi_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_unpackhi_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhwd
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[79:64] dst[31:16] := src2[79:64] dst[47:32] := src1[95:80] dst[63:48] := src2[95:80] dst[79:64] := src1[111:96] dst[95:80] := src2[111:96] dst[111:96] := src1[127:112] dst[127:112] := src2[127:112] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpunpckhwd
__m256i _mm256_maskz_unpackhi_epi16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_unpackhi_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhwd
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[79:64] dst[31:16] := src2[79:64] dst[47:32] := src1[95:80] dst[63:48] := src2[95:80] dst[79:64] := src1[111:96] dst[95:80] := src2[111:96] dst[111:96] := src1[127:112] dst[127:112] := src2[127:112] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpunpckhwd
__m256i _mm256_unpackhi_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_unpackhi_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhwd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[79:64] dst[31:16] := src2[79:64] dst[47:32] := src1[95:80] dst[63:48] := src2[95:80] dst[79:64] := src1[111:96] dst[95:80] := src2[111:96] dst[111:96] := src1[127:112] dst[127:112] := src2[127:112] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
vpunpckhwd
__m512i _mm512_mask_unpackhi_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_unpackhi_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhwd
CPUID Flags: AVX512BW

Description

Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[79:64] dst[31:16] := src2[79:64] dst[47:32] := src1[95:80] dst[63:48] := src2[95:80] dst[79:64] := src1[111:96] dst[95:80] := src2[111:96] dst[111:96] := src1[127:112] dst[127:112] := src2[127:112] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpunpckhwd
__m512i _mm512_maskz_unpackhi_epi16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_unpackhi_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhwd
CPUID Flags: AVX512BW

Description

Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[79:64] dst[31:16] := src2[79:64] dst[47:32] := src1[95:80] dst[63:48] := src2[95:80] dst[79:64] := src1[111:96] dst[95:80] := src2[111:96] dst[111:96] := src1[127:112] dst[127:112] := src2[127:112] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpunpckhwd
__m512i _mm512_unpackhi_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_unpackhi_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhwd
CPUID Flags: AVX512BW

Description

Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[79:64] dst[31:16] := src2[79:64] dst[47:32] := src1[95:80] dst[63:48] := src2[95:80] dst[79:64] := src1[111:96] dst[95:80] := src2[111:96] dst[111:96] := src1[127:112] dst[127:112] := src2[127:112] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) dst[MAX:512] := 0
vpunpckhdq
__m128i _mm_mask_unpackhi_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_unpackhi_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhdq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 32-bit integers from the high half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpunpckhdq
__m128i _mm_maskz_unpackhi_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_unpackhi_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhdq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 32-bit integers from the high half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
punpckhdq
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpckhdq xmm, xmm
CPUID Flags: SSE2

Description

Unpack and interleave 32-bit integers from the high half of a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpunpckhdq
__m256i _mm256_mask_unpackhi_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_unpackhi_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhdq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpunpckhdq
__m256i _mm256_maskz_unpackhi_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_unpackhi_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhdq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpunpckhdq
__m256i _mm256_unpackhi_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_unpackhi_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhdq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
vpunpckhdq
__m512i _mm512_mask_unpackhi_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_unpackhi_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhdq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpunpckhdq
__m512i _mm512_maskz_unpackhi_epi32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_unpackhi_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhdq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpunpckhdq
__m512i _mm512_unpackhi_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_unpackhi_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhdq zmm, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) dst[MAX:512] := 0
vpunpckhqdq
__m128i _mm_mask_unpackhi_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_unpackhi_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhqdq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 64-bit integers from the high half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpunpckhqdq
__m128i _mm_maskz_unpackhi_epi64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_unpackhi_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhqdq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 64-bit integers from the high half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
punpckhqdq
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpckhqdq xmm, xmm
CPUID Flags: SSE2

Description

Unpack and interleave 64-bit integers from the high half of a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpunpckhqdq
__m256i _mm256_mask_unpackhi_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_unpackhi_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhqdq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpunpckhqdq
__m256i _mm256_maskz_unpackhi_epi64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_unpackhi_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhqdq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpunpckhqdq
__m256i _mm256_unpackhi_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_unpackhi_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhqdq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
vpunpckhqdq
__m512i _mm512_mask_unpackhi_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_unpackhi_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhqdq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpunpckhqdq
__m512i _mm512_maskz_unpackhi_epi64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_unpackhi_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhqdq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpunpckhqdq
__m512i _mm512_unpackhi_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_unpackhi_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhqdq zmm, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) dst[MAX:512] := 0
vpunpckhbw
__m128i _mm_mask_unpackhi_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_unpackhi_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhbw
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 8-bit integers from the high half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[71:64] dst[15:8] := src2[71:64] dst[23:16] := src1[79:72] dst[31:24] := src2[79:72] dst[39:32] := src1[87:80] dst[47:40] := src2[87:80] dst[55:48] := src1[95:88] dst[63:56] := src2[95:88] dst[71:64] := src1[103:96] dst[79:72] := src2[103:96] dst[87:80] := src1[111:104] dst[95:88] := src2[111:104] dst[103:96] := src1[119:112] dst[111:104] := src2[119:112] dst[119:112] := src1[127:120] dst[127:120] := src2[127:120] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpunpckhbw
__m128i _mm_maskz_unpackhi_epi8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_unpackhi_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhbw
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 8-bit integers from the high half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[71:64] dst[15:8] := src2[71:64] dst[23:16] := src1[79:72] dst[31:24] := src2[79:72] dst[39:32] := src1[87:80] dst[47:40] := src2[87:80] dst[55:48] := src1[95:88] dst[63:56] := src2[95:88] dst[71:64] := src1[103:96] dst[79:72] := src2[103:96] dst[87:80] := src1[111:104] dst[95:88] := src2[111:104] dst[103:96] := src1[119:112] dst[111:104] := src2[119:112] dst[119:112] := src1[127:120] dst[127:120] := src2[127:120] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
punpckhbw
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpckhbw xmm, xmm
CPUID Flags: SSE2

Description

Unpack and interleave 8-bit integers from the high half of a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[71:64] dst[15:8] := src2[71:64] dst[23:16] := src1[79:72] dst[31:24] := src2[79:72] dst[39:32] := src1[87:80] dst[47:40] := src2[87:80] dst[55:48] := src1[95:88] dst[63:56] := src2[95:88] dst[71:64] := src1[103:96] dst[79:72] := src2[103:96] dst[87:80] := src1[111:104] dst[95:88] := src2[111:104] dst[103:96] := src1[119:112] dst[111:104] := src2[119:112] dst[119:112] := src1[127:120] dst[127:120] := src2[127:120] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpunpckhbw
__m256i _mm256_mask_unpackhi_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_unpackhi_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhbw
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[71:64] dst[15:8] := src2[71:64] dst[23:16] := src1[79:72] dst[31:24] := src2[79:72] dst[39:32] := src1[87:80] dst[47:40] := src2[87:80] dst[55:48] := src1[95:88] dst[63:56] := src2[95:88] dst[71:64] := src1[103:96] dst[79:72] := src2[103:96] dst[87:80] := src1[111:104] dst[95:88] := src2[111:104] dst[103:96] := src1[119:112] dst[111:104] := src2[119:112] dst[119:112] := src1[127:120] dst[127:120] := src2[127:120] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpunpckhbw
__m256i _mm256_maskz_unpackhi_epi8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_unpackhi_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhbw
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[71:64] dst[15:8] := src2[71:64] dst[23:16] := src1[79:72] dst[31:24] := src2[79:72] dst[39:32] := src1[87:80] dst[47:40] := src2[87:80] dst[55:48] := src1[95:88] dst[63:56] := src2[95:88] dst[71:64] := src1[103:96] dst[79:72] := src2[103:96] dst[87:80] := src1[111:104] dst[95:88] := src2[111:104] dst[103:96] := src1[119:112] dst[111:104] := src2[119:112] dst[119:112] := src1[127:120] dst[127:120] := src2[127:120] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpunpckhbw
__m256i _mm256_unpackhi_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_unpackhi_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhbw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[71:64] dst[15:8] := src2[71:64] dst[23:16] := src1[79:72] dst[31:24] := src2[79:72] dst[39:32] := src1[87:80] dst[47:40] := src2[87:80] dst[55:48] := src1[95:88] dst[63:56] := src2[95:88] dst[71:64] := src1[103:96] dst[79:72] := src2[103:96] dst[87:80] := src1[111:104] dst[95:88] := src2[111:104] dst[103:96] := src1[119:112] dst[111:104] := src2[119:112] dst[119:112] := src1[127:120] dst[127:120] := src2[127:120] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
vpunpckhbw
__m512i _mm512_mask_unpackhi_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_unpackhi_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhbw
CPUID Flags: AVX512BW

Description

Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[71:64] dst[15:8] := src2[71:64] dst[23:16] := src1[79:72] dst[31:24] := src2[79:72] dst[39:32] := src1[87:80] dst[47:40] := src2[87:80] dst[55:48] := src1[95:88] dst[63:56] := src2[95:88] dst[71:64] := src1[103:96] dst[79:72] := src2[103:96] dst[87:80] := src1[111:104] dst[95:88] := src2[111:104] dst[103:96] := src1[119:112] dst[111:104] := src2[119:112] dst[119:112] := src1[127:120] dst[127:120] := src2[127:120] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpunpckhbw
__m512i _mm512_maskz_unpackhi_epi8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_unpackhi_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhbw
CPUID Flags: AVX512BW

Description

Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[71:64] dst[15:8] := src2[71:64] dst[23:16] := src1[79:72] dst[31:24] := src2[79:72] dst[39:32] := src1[87:80] dst[47:40] := src2[87:80] dst[55:48] := src1[95:88] dst[63:56] := src2[95:88] dst[71:64] := src1[103:96] dst[79:72] := src2[103:96] dst[87:80] := src1[111:104] dst[95:88] := src2[111:104] dst[103:96] := src1[119:112] dst[111:104] := src2[119:112] dst[119:112] := src1[127:120] dst[127:120] := src2[127:120] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpunpckhbw
__m512i _mm512_unpackhi_epi8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_unpackhi_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhbw
CPUID Flags: AVX512BW

Description

Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[71:64] dst[15:8] := src2[71:64] dst[23:16] := src1[79:72] dst[31:24] := src2[79:72] dst[39:32] := src1[87:80] dst[47:40] := src2[87:80] dst[55:48] := src1[95:88] dst[63:56] := src2[95:88] dst[71:64] := src1[103:96] dst[79:72] := src2[103:96] dst[87:80] := src1[111:104] dst[95:88] := src2[111:104] dst[103:96] := src1[119:112] dst[111:104] := src2[119:112] dst[119:112] := src1[127:120] dst[127:120] := src2[127:120] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) dst[MAX:512] := 0
vunpckhpd
__m128d _mm_mask_unpackhi_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_unpackhi_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vunpckhpd
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the high half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vunpckhpd
__m128d _mm_maskz_unpackhi_pd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_unpackhi_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vunpckhpd
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the high half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
unpckhpd
__m128d _mm_unpackhi_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_unpackhi_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: unpckhpd xmm, xmm
CPUID Flags: SSE2

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the high half of a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
Westmere11
Nehalem11
vunpckhpd
__m256d _mm256_mask_unpackhi_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_mask_unpackhi_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vunpckhpd
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vunpckhpd
__m256d _mm256_maskz_unpackhi_pd (__mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_maskz_unpackhi_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vunpckhpd
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vunpckhpd
__m256d _mm256_unpackhi_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_unpackhi_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vunpckhpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vunpckhpd
__m512d _mm512_mask_unpackhi_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_unpackhi_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vunpckhpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vunpckhpd
__m512d _mm512_maskz_unpackhi_pd (__mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_maskz_unpackhi_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vunpckhpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vunpckhpd
__m512d _mm512_unpackhi_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_unpackhi_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vunpckhpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) dst[MAX:512] := 0
vunpckhps
__m128 _mm_mask_unpackhi_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_unpackhi_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vunpckhps
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the high half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vunpckhps
__m128 _mm_maskz_unpackhi_ps (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_unpackhi_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vunpckhps
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the high half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
unpckhps
__m128 _mm_unpackhi_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_unpackhi_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: unpckhps xmm, xmm
CPUID Flags: SSE

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the high half of a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
Westmere11
Nehalem11
vunpckhps
__m256 _mm256_mask_unpackhi_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_mask_unpackhi_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vunpckhps
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vunpckhps
__m256 _mm256_maskz_unpackhi_ps (__mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_maskz_unpackhi_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vunpckhps
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vunpckhps
__m256 _mm256_unpackhi_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_unpackhi_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vunpckhps ymm, ymm, ymm
CPUID Flags: AVX

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vunpckhps
__m512 _mm512_mask_unpackhi_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_unpackhi_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vunpckhps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vunpckhps
__m512 _mm512_maskz_unpackhi_ps (__mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_maskz_unpackhi_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vunpckhps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vunpckhps
__m512 _mm512_unpackhi_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_unpackhi_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vunpckhps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) dst[MAX:512] := 0
vpunpcklwd
__m128i _mm_mask_unpacklo_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_unpacklo_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpcklwd
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 16-bit integers from the low half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[15:0] dst[31:16] := src2[15:0] dst[47:32] := src1[31:16] dst[63:48] := src2[31:16] dst[79:64] := src1[47:32] dst[95:80] := src2[47:32] dst[111:96] := src1[63:48] dst[127:112] := src2[63:48] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
vpunpcklwd
__m128i _mm_maskz_unpacklo_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_unpacklo_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpcklwd
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 16-bit integers from the low half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[15:0] dst[31:16] := src2[15:0] dst[47:32] := src1[31:16] dst[63:48] := src2[31:16] dst[79:64] := src1[47:32] dst[95:80] := src2[47:32] dst[111:96] := src1[63:48] dst[127:112] := src2[63:48] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
punpcklwd
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpcklwd xmm, xmm
CPUID Flags: SSE2

Description

Unpack and interleave 16-bit integers from the low half of a and b, and store the results in dst.

Operation

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[15:0] dst[31:16] := src2[15:0] dst[47:32] := src1[31:16] dst[63:48] := src2[31:16] dst[79:64] := src1[47:32] dst[95:80] := src2[47:32] dst[111:96] := src1[63:48] dst[127:112] := src2[63:48] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpunpcklwd
__m256i _mm256_mask_unpacklo_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_unpacklo_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklwd
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[15:0] dst[31:16] := src2[15:0] dst[47:32] := src1[31:16] dst[63:48] := src2[31:16] dst[79:64] := src1[47:32] dst[95:80] := src2[47:32] dst[111:96] := src1[63:48] dst[127:112] := src2[63:48] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
vpunpcklwd
__m256i _mm256_maskz_unpacklo_epi16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_unpacklo_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklwd
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[15:0] dst[31:16] := src2[15:0] dst[47:32] := src1[31:16] dst[63:48] := src2[31:16] dst[79:64] := src1[47:32] dst[95:80] := src2[47:32] dst[111:96] := src1[63:48] dst[127:112] := src2[63:48] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpunpcklwd
__m256i _mm256_unpacklo_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_unpacklo_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklwd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[15:0] dst[31:16] := src2[15:0] dst[47:32] := src1[31:16] dst[63:48] := src2[31:16] dst[79:64] := src1[47:32] dst[95:80] := src2[47:32] dst[111:96] := src1[63:48] dst[127:112] := src2[63:48] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
vpunpcklwd
__m512i _mm512_mask_unpacklo_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_unpacklo_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklwd
CPUID Flags: AVX512BW

Description

Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[15:0] dst[31:16] := src2[15:0] dst[47:32] := src1[31:16] dst[63:48] := src2[31:16] dst[79:64] := src1[47:32] dst[95:80] := src2[47:32] dst[111:96] := src1[63:48] dst[127:112] := src2[63:48] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
vpunpcklwd
__m512i _mm512_maskz_unpacklo_epi16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_unpacklo_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklwd
CPUID Flags: AVX512BW

Description

Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[15:0] dst[31:16] := src2[15:0] dst[47:32] := src1[31:16] dst[63:48] := src2[31:16] dst[79:64] := src1[47:32] dst[95:80] := src2[47:32] dst[111:96] := src1[63:48] dst[127:112] := src2[63:48] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpunpcklwd
__m512i _mm512_unpacklo_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_unpacklo_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklwd
CPUID Flags: AVX512BW

Description

Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[15:0] dst[31:16] := src2[15:0] dst[47:32] := src1[31:16] dst[63:48] := src2[31:16] dst[79:64] := src1[47:32] dst[95:80] := src2[47:32] dst[111:96] := src1[63:48] dst[127:112] := src2[63:48] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) dst[MAX:512] := 0
vpunpckldq
__m128i _mm_mask_unpacklo_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_unpacklo_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckldq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 32-bit integers from the low half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpunpckldq
__m128i _mm_maskz_unpacklo_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_unpacklo_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckldq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 32-bit integers from the low half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
punpckldq
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpckldq xmm, xmm
CPUID Flags: SSE2

Description

Unpack and interleave 32-bit integers from the low half of a and b, and store the results in dst.

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge10.5
Sandy Bridge10.5
Westmere10.5
Nehalem10.5
vpunpckldq
__m256i _mm256_mask_unpacklo_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_unpacklo_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckldq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpunpckldq
__m256i _mm256_maskz_unpacklo_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_unpacklo_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckldq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpunpckldq
__m256i _mm256_unpacklo_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_unpacklo_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckldq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
vpunpckldq
__m512i _mm512_mask_unpacklo_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_unpacklo_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckldq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpunpckldq
__m512i _mm512_maskz_unpacklo_epi32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_unpacklo_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckldq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpunpckldq
__m512i _mm512_unpacklo_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_unpacklo_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckldq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) dst[MAX:512] := 0
vpunpcklqdq
__m128i _mm_mask_unpacklo_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_unpacklo_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpcklqdq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 64-bit integers from the low half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpunpcklqdq
__m128i _mm_maskz_unpacklo_epi64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_unpacklo_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpcklqdq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 64-bit integers from the low half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
punpcklqdq
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpcklqdq xmm, xmm
CPUID Flags: SSE2

Description

Unpack and interleave 64-bit integers from the low half of a and b, and store the results in dst.

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])

Performance

Architecture | Latency | Throughput
Haswell      | 1       | 1
Ivy Bridge   | 1       | 0.5
Sandy Bridge | 1       | 0.5
Westmere     | 1       | 0.5
Nehalem      | 1       | 0.5
vpunpcklqdq
__m256i _mm256_mask_unpacklo_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_unpacklo_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklqdq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpunpcklqdq
__m256i _mm256_maskz_unpacklo_epi64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_unpacklo_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklqdq
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpunpcklqdq
__m256i _mm256_unpacklo_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_unpacklo_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklqdq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 1       | 1
vpunpcklqdq
__m512i _mm512_mask_unpacklo_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_unpacklo_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklqdq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpunpcklqdq
__m512i _mm512_maskz_unpacklo_epi64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_unpacklo_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklqdq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpunpcklqdq
__m512i _mm512_unpacklo_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_unpacklo_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklqdq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) dst[MAX:512] := 0
vpunpcklbw
__m128i _mm_mask_unpacklo_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_unpacklo_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpcklbw
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 8-bit integers from the low half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[7:0] dst[15:8] := src2[7:0] dst[23:16] := src1[15:8] dst[31:24] := src2[15:8] dst[39:32] := src1[23:16] dst[47:40] := src2[23:16] dst[55:48] := src1[31:24] dst[63:56] := src2[31:24] dst[71:64] := src1[39:32] dst[79:72] := src2[39:32] dst[87:80] := src1[47:40] dst[95:88] := src2[47:40] dst[103:96] := src1[55:48] dst[111:104] := src2[55:48] dst[119:112] := src1[63:56] dst[127:120] := src2[63:56] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
vpunpcklbw
__m128i _mm_maskz_unpacklo_epi8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_unpacklo_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpcklbw
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 8-bit integers from the low half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[7:0] dst[15:8] := src2[7:0] dst[23:16] := src1[15:8] dst[31:24] := src2[15:8] dst[39:32] := src1[23:16] dst[47:40] := src2[23:16] dst[55:48] := src1[31:24] dst[63:56] := src2[31:24] dst[71:64] := src1[39:32] dst[79:72] := src2[39:32] dst[87:80] := src1[47:40] dst[95:88] := src2[47:40] dst[103:96] := src1[55:48] dst[111:104] := src2[55:48] dst[119:112] := src1[63:56] dst[127:120] := src2[63:56] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
punpcklbw
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpcklbw xmm, xmm
CPUID Flags: SSE2

Description

Unpack and interleave 8-bit integers from the low half of a and b, and store the results in dst.

Operation

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[7:0] dst[15:8] := src2[7:0] dst[23:16] := src1[15:8] dst[31:24] := src2[15:8] dst[39:32] := src1[23:16] dst[47:40] := src2[23:16] dst[55:48] := src1[31:24] dst[63:56] := src2[31:24] dst[71:64] := src1[39:32] dst[79:72] := src2[39:32] dst[87:80] := src1[47:40] dst[95:88] := src2[47:40] dst[103:96] := src1[55:48] dst[111:104] := src2[55:48] dst[119:112] := src1[63:56] dst[127:120] := src2[63:56] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])

Performance

Architecture | Latency | Throughput
Ivy Bridge   | 1       | 0.5
Sandy Bridge | 1       | 0.5
Westmere     | 1       | 0.5
Nehalem      | 1       | 0.5
vpunpcklbw
__m256i _mm256_mask_unpacklo_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_unpacklo_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklbw
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[7:0] dst[15:8] := src2[7:0] dst[23:16] := src1[15:8] dst[31:24] := src2[15:8] dst[39:32] := src1[23:16] dst[47:40] := src2[23:16] dst[55:48] := src1[31:24] dst[63:56] := src2[31:24] dst[71:64] := src1[39:32] dst[79:72] := src2[39:32] dst[87:80] := src1[47:40] dst[95:88] := src2[47:40] dst[103:96] := src1[55:48] dst[111:104] := src2[55:48] dst[119:112] := src1[63:56] dst[127:120] := src2[63:56] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
vpunpcklbw
__m256i _mm256_maskz_unpacklo_epi8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_unpacklo_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklbw
CPUID Flags: AVX512VL + AVX512BW

Description

Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[7:0] dst[15:8] := src2[7:0] dst[23:16] := src1[15:8] dst[31:24] := src2[15:8] dst[39:32] := src1[23:16] dst[47:40] := src2[23:16] dst[55:48] := src1[31:24] dst[63:56] := src2[31:24] dst[71:64] := src1[39:32] dst[79:72] := src2[39:32] dst[87:80] := src1[47:40] dst[95:88] := src2[47:40] dst[103:96] := src1[55:48] dst[111:104] := src2[55:48] dst[119:112] := src1[63:56] dst[127:120] := src2[63:56] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpunpcklbw
__m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklbw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[7:0] dst[15:8] := src2[7:0] dst[23:16] := src1[15:8] dst[31:24] := src2[15:8] dst[39:32] := src1[23:16] dst[47:40] := src2[23:16] dst[55:48] := src1[31:24] dst[63:56] := src2[31:24] dst[71:64] := src1[39:32] dst[79:72] := src2[39:32] dst[87:80] := src1[47:40] dst[95:88] := src2[47:40] dst[103:96] := src1[55:48] dst[111:104] := src2[55:48] dst[119:112] := src1[63:56] dst[127:120] := src2[63:56] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 1       | 1
vpunpcklbw
__m512i _mm512_mask_unpacklo_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_unpacklo_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklbw
CPUID Flags: AVX512BW

Description

Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[7:0] dst[15:8] := src2[7:0] dst[23:16] := src1[15:8] dst[31:24] := src2[15:8] dst[39:32] := src1[23:16] dst[47:40] := src2[23:16] dst[55:48] := src1[31:24] dst[63:56] := src2[31:24] dst[71:64] := src1[39:32] dst[79:72] := src2[39:32] dst[87:80] := src1[47:40] dst[95:88] := src2[47:40] dst[103:96] := src1[55:48] dst[111:104] := src2[55:48] dst[119:112] := src1[63:56] dst[127:120] := src2[63:56] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
vpunpcklbw
__m512i _mm512_maskz_unpacklo_epi8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_unpacklo_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklbw
CPUID Flags: AVX512BW

Description

Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[7:0] dst[15:8] := src2[7:0] dst[23:16] := src1[15:8] dst[31:24] := src2[15:8] dst[39:32] := src1[23:16] dst[47:40] := src2[23:16] dst[55:48] := src1[31:24] dst[63:56] := src2[31:24] dst[71:64] := src1[39:32] dst[79:72] := src2[39:32] dst[87:80] := src1[47:40] dst[95:88] := src2[47:40] dst[103:96] := src1[55:48] dst[111:104] := src2[55:48] dst[119:112] := src1[63:56] dst[127:120] := src2[63:56] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpunpcklbw
__m512i _mm512_unpacklo_epi8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_unpacklo_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklbw
CPUID Flags: AVX512BW

Description

Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[7:0] dst[15:8] := src2[7:0] dst[23:16] := src1[15:8] dst[31:24] := src2[15:8] dst[39:32] := src1[23:16] dst[47:40] := src2[23:16] dst[55:48] := src1[31:24] dst[63:56] := src2[31:24] dst[71:64] := src1[39:32] dst[79:72] := src2[39:32] dst[87:80] := src1[47:40] dst[95:88] := src2[47:40] dst[103:96] := src1[55:48] dst[111:104] := src2[55:48] dst[119:112] := src1[63:56] dst[127:120] := src2[63:56] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) dst[MAX:512] := 0
vunpcklpd
__m128d _mm_mask_unpacklo_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_unpacklo_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vunpcklpd
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the low half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vunpcklpd
__m128d _mm_maskz_unpacklo_pd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_unpacklo_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vunpcklpd
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the low half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
unpcklpd
__m128d _mm_unpacklo_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_unpacklo_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: unpcklpd xmm, xmm
CPUID Flags: SSE2

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the low half of a and b, and store the results in dst.

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])

Performance

Architecture | Latency | Throughput
Haswell      | 1       | 1
Ivy Bridge   | 1       | 1
Sandy Bridge | 1       | 1
Westmere     | 1       | 1
Nehalem      | 1       | 1
vunpcklpd
__m256d _mm256_mask_unpacklo_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_mask_unpacklo_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vunpcklpd
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vunpcklpd
__m256d _mm256_maskz_unpacklo_pd (__mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_maskz_unpacklo_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vunpcklpd
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vunpcklpd
__m256d _mm256_unpacklo_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_unpacklo_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vunpcklpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 1       | 1
Ivy Bridge   | 1       | 1
Sandy Bridge | 1       | 1
vunpcklpd
__m512d _mm512_mask_unpacklo_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_unpacklo_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vunpcklpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vunpcklpd
__m512d _mm512_maskz_unpacklo_pd (__mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_maskz_unpacklo_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vunpcklpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vunpcklpd
__m512d _mm512_unpacklo_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_unpacklo_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vunpcklpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) dst[MAX:512] := 0
vunpcklps
__m128 _mm_mask_unpacklo_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_unpacklo_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vunpcklps
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the low half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vunpcklps
__m128 _mm_maskz_unpacklo_ps (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_unpacklo_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vunpcklps
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the low half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
unpcklps
__m128 _mm_unpacklo_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_unpacklo_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: unpcklps xmm, xmm
CPUID Flags: SSE

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the low half of a and b, and store the results in dst.

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])

Performance

Architecture | Latency | Throughput
Haswell      | 1       | 1
Ivy Bridge   | 1       | 1
Sandy Bridge | 1       | 1
Westmere     | 1       | 1
Nehalem      | 1       | 1
vunpcklps
__m256 _mm256_mask_unpacklo_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_mask_unpacklo_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vunpcklps
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vunpcklps
__m256 _mm256_maskz_unpacklo_ps (__mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_maskz_unpacklo_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vunpcklps
CPUID Flags: AVX512VL + AVX512F

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vunpcklps
__m256 _mm256_unpacklo_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_unpacklo_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vunpcklps ymm, ymm, ymm
CPUID Flags: AVX

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput
Haswell      | 1       | 1
Ivy Bridge   | 1       | 1
Sandy Bridge | 1       | 1
vunpcklps
__m512 _mm512_mask_unpacklo_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_unpacklo_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vunpcklps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vunpcklps
__m512 _mm512_maskz_unpacklo_ps (__mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_maskz_unpacklo_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vunpcklps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vunpcklps
__m512 _mm512_unpacklo_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_unpacklo_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vunpcklps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.

Operation

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) dst[MAX:512] := 0
...
__m128i _mm_urem_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_urem_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE

Description

Divide packed unsigned 32-bit integers in a by packed elements in b, and store the remainders as packed unsigned 32-bit integers in dst.

Operation

FOR j := 0 to 3 i := 32*j dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:128] := 0
...
__m256i _mm256_urem_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_urem_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX

Description

Divide packed unsigned 32-bit integers in a by packed elements in b, and store the remainders as packed unsigned 32-bit integers in dst.

Operation

FOR j := 0 to 7 i := 32*j dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:256] := 0
wrfsbase
void _writefsbase_u32 (unsigned int a)

Synopsis

void _writefsbase_u32 (unsigned int a)
#include "immintrin.h"
Instruction: wrfsbase r32
CPUID Flags: FSGSBASE

Description

Write the unsigned 32-bit integer a to the FS segment base register.

Operation

FS_Segment_Base_Register[31:0] := a[31:0]; FS_Segment_Base_Register[63:32] := 0
wrfsbase
void _writefsbase_u64 (unsigned __int64 a)

Synopsis

void _writefsbase_u64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: wrfsbase r64
CPUID Flags: FSGSBASE

Description

Write the unsigned 64-bit integer a to the FS segment base register.

Operation

FS_Segment_Base_Register[63:0] := a[63:0];
wrgsbase
void _writegsbase_u32 (unsigned int a)

Synopsis

void _writegsbase_u32 (unsigned int a)
#include "immintrin.h"
Instruction: wrgsbase r32
CPUID Flags: FSGSBASE

Description

Write the unsigned 32-bit integer a to the GS segment base register.

Operation

GS_Segment_Base_Register[31:0] := a[31:0]; GS_Segment_Base_Register[63:32] := 0
wrgsbase
void _writegsbase_u64 (unsigned __int64 a)

Synopsis

void _writegsbase_u64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: wrgsbase r64
CPUID Flags: FSGSBASE

Description

Write the unsigned 64-bit integer a to the GS segment base register.

Operation

GS_Segment_Base_Register[63:0] := a[63:0];
xabort
void _xabort (const unsigned int imm8)

Synopsis

void _xabort (const unsigned int imm8)
#include "immintrin.h"
Instruction: xabort imm
CPUID Flags: RTM

Description

Force an RTM abort. The EAX register is updated to reflect an XABORT instruction caused the abort, and the imm8 parameter will be provided in bits [31:24] of EAX. Following an RTM abort, the logical processor resumes execution at the fallback address computed through the outermost XBEGIN instruction.

Operation

IF RTM_ACTIVE = 0 // nop ELSE // restore architectural register state // discard memory updates performed in transaction // update EAX with status and imm8 value RTM_NEST_COUNT := 0 RTM_ACTIVE := 0 IF 64-bit Mode RIP := fallbackRIP ELSE EIP := fallbackEIP FI FI
xbegin
unsigned int _xbegin (void)

Synopsis

unsigned int _xbegin (void)
#include "immintrin.h"
Instruction: xbegin
CPUID Flags: RTM

Description

Specify the start of an RTM code region. If the logical processor was not already in transactional execution, then this call causes the logical processor to transition into transactional execution. On an RTM abort, the logical processor discards all architectural register and memory updates performed during the RTM execution, restores architectural state, and starts execution beginning at the fallback address computed from the outermost XBEGIN instruction.

Operation

IF RTM_NEST_COUNT < MAX_RTM_NEST_COUNT RTM_NEST_COUNT := RTM_NEST_COUNT + 1 IF RTM_NEST_COUNT = 1 IF 64-bit Mode fallbackRIP := RIP + SignExtend(IMM) ELSE IF 32-bit Mode fallbackEIP := EIP + SignExtend(IMM) ELSE // 16-bit Mode fallbackEIP := (EIP + SignExtend(IMM)) AND 0x0000FFFF FI RTM_ACTIVE := 1 // enter RTM execution, record register state, start tracking memory state FI ELSE // RTM abort (see _xabort) FI
xend
void _xend (void)

Synopsis

void _xend (void)
#include "immintrin.h"
Instruction: xend
CPUID Flags: RTM

Description

Specify the end of an RTM code region. If this corresponds to the outermost scope, the logical processor will attempt to commit the logical processor state atomically. If the commit fails, the logical processor will perform an RTM abort.

Operation

IF RTM_ACTIVE = 1 RTM_NEST_COUNT := RTM_NEST_COUNT - 1 IF RTM_NEST_COUNT = 0 // try to commit transaction IF fail to commit transaction // RTM abort (see _xabort) ELSE RTM_ACTIVE := 0 FI FI FI
xgetbv
unsigned __int64 _xgetbv (unsigned int a)

Synopsis

unsigned __int64 _xgetbv (unsigned int a)
#include "immintrin.h"
Instruction: xgetbv
CPUID Flags: XSAVE

Description

Copy up to 64-bits from the value of the extended control register (XCR) specified by a into dst. Currently only XFEATURE_ENABLED_MASK XCR is supported.

Operation

dst[63:0] := XCR[a]
vpxord
__m128i _mm_mask_xor_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_xor_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpxord
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vpxord
__m128i _mm_maskz_xor_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_xor_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpxord
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpxord
__m256i _mm256_mask_xor_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_xor_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpxord
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vpxord
__m256i _mm256_maskz_xor_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_xor_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpxord
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpxord
__m512i _mm512_mask_xor_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_xor_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpxord zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vpxord
__m512i _mm512_maskz_xor_epi32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_xor_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpxord zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpxord
__m512i _mm512_xor_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_xor_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpxord zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ENDFOR dst[MAX:512] := 0
vpxorq
__m128i _mm_mask_xor_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_xor_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpxorq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vpxorq
__m128i _mm_maskz_xor_epi64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_xor_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpxorq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
vpxorq
__m256i _mm256_mask_xor_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_xor_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpxorq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vpxorq
__m256i _mm256_maskz_xor_epi64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_xor_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpxorq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vpxorq
__m512i _mm512_mask_xor_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_xor_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vpxorq
__m512i _mm512_maskz_xor_epi64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_xor_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vpxorq
__m512i _mm512_xor_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_xor_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ENDFOR dst[MAX:512] := 0
vxorpd
__m128d _mm_mask_xor_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_xor_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vxorpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
vxorpd
__m128d _mm_maskz_xor_pd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_xor_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vxorpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
xorpd
__m128d _mm_xor_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_xor_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: xorpd xmm, xmm
CPUID Flags: SSE2

Description

Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 1 i := j*64 dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.8
Ivy Bridge10.33
Sandy Bridge10.33
Westmere10.33
Nehalem10.33
vxorpd
__m256d _mm256_mask_xor_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_mask_xor_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vxorpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
vxorpd
__m256d _mm256_maskz_xor_pd (__mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_maskz_xor_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vxorpd
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
vxorpd
__m256d _mm256_xor_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_xor_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vxorpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*64 dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vxorpd
__m512d _mm512_mask_xor_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_xor_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vxorpd
CPUID Flags: AVX512DQ

Description

Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
vxorpd
__m512d _mm512_maskz_xor_pd (__mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_maskz_xor_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vxorpd
CPUID Flags: AVX512DQ

Description

Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
vxorpd
__m512d _mm512_xor_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_xor_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vxorpd
CPUID Flags: AVX512DQ

Description

Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ENDFOR dst[MAX:512] := 0
vxorps
__m128 _mm_mask_xor_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_xor_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vxorps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
vxorps
__m128 _mm_maskz_xor_ps (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_xor_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vxorps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
xorps
__m128 _mm_xor_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_xor_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: xorps xmm, xmm
CPUID Flags: SSE

Description

Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3 i := j*32 dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ENDFOR

Performance

ArchitectureLatencyThroughput
Haswell10.33
Ivy Bridge10.33
Sandy Bridge10.33
Westmere10.33
Nehalem10.33
vxorps
__m256 _mm256_mask_xor_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_mask_xor_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vxorps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
vxorps
__m256 _mm256_maskz_xor_ps (__mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_maskz_xor_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vxorps
CPUID Flags: AVX512VL + AVX512DQ

Description

Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
vxorps
__m256 _mm256_xor_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_xor_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vxorps ymm, ymm, ymm
CPUID Flags: AVX

Description

Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 i := j*32 dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ENDFOR dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell11
Ivy Bridge11
Sandy Bridge11
vxorps
__m512 _mm512_mask_xor_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_xor_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vxorps
CPUID Flags: AVX512DQ

Description

Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
vxorps
__m512 _mm512_maskz_xor_ps (__mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_maskz_xor_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vxorps
CPUID Flags: AVX512DQ

Description

Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
vxorps
__m512 _mm512_xor_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_xor_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vxorps
CPUID Flags: AVX512DQ

Description

Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ENDFOR dst[MAX:512] := 0
pxor
__m128i _mm_xor_si128 (__m128i a, __m128i b)

Synopsis

__m128i _mm_xor_si128 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pxor xmm, xmm
CPUID Flags: SSE2

Description

Compute the bitwise XOR of 128 bits (representing integer data) in a and b, and store the result in dst.

Operation

dst[127:0] := (a[127:0] XOR b[127:0])

Performance

ArchitectureLatencyThroughput
Haswell10.33
Ivy Bridge10.33
Sandy Bridge10.33
Westmere10.33
Nehalem10.33
vpxor
__m256i _mm256_xor_si256 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_xor_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpxor ymm, ymm, ymm
CPUID Flags: AVX2

Description

Compute the bitwise XOR of 256 bits (representing integer data) in a and b, and store the result in dst.

Operation

dst[255:0] := (a[255:0] XOR b[255:0]) dst[MAX:256] := 0

Performance

ArchitectureLatencyThroughput
Haswell1-
vpxord
__m512i _mm512_xor_si512 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_xor_si512 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpxord zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Compute the bitwise XOR of 512 bits (representing integer data) in a and b, and store the result in dst.

Operation

dst[511:0] := (a[511:0] XOR b[511:0]) dst[MAX:512] := 0
xrstor
void _xrstor (void * mem_addr, unsigned __int64 rs_mask)

Synopsis

void _xrstor (void * mem_addr, unsigned __int64 rs_mask)
#include "immintrin.h"
Instruction: xrstor MEMmxsave
CPUID Flags: XSAVE

Description

Perform a full or partial restore of the enabled processor states using the state information stored in memory at mem_addr. State is restored based on bits [62:0] in rs_mask, XCR0, and mem_addr.HEADER.XSTATE_BV. mem_addr must be aligned on a 64-byte boundary.

Operation

st_mask := mem_addr.HEADER.XSTATE_BV[62:0] FOR i := 0 to 62 IF (rs_mask[i] AND XCR0[i]) IF st_mask[i] CASE (i) OF 0: ProcessorState[x87 FPU] := mem_addr.FPUSSESave_Area[FPU] 1: ProcessorState[SSE] := mem_addr.FPUSSESave_Area[SSE] DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] ESAC ELSE // ProcessorExtendedState := Processor Supplied Values CASE (i) OF 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] ESAC FI FI i := i + 1 ENDFOR
xrstor64
void _xrstor64 (void * mem_addr, unsigned __int64 rs_mask)

Synopsis

void _xrstor64 (void * mem_addr, unsigned __int64 rs_mask)
#include "immintrin.h"
Instruction: xrstor64 MEMmxsave
CPUID Flags: XSAVE

Description

Perform a full or partial restore of the enabled processor states using the state information stored in memory at mem_addr. State is restored based on bits [62:0] in rs_mask, XCR0, and mem_addr.HEADER.XSTATE_BV. mem_addr must be aligned on a 64-byte boundary.

Operation

st_mask := mem_addr.HEADER.XSTATE_BV[62:0] FOR i := 0 to 62 IF (rs_mask[i] AND XCR0[i]) IF st_mask[i] CASE (i) OF 0: ProcessorState[x87 FPU] := mem_addr.FPUSSESave_Area[FPU] 1: ProcessorState[SSE] := mem_addr.FPUSSESave_Area[SSE] DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] ESAC ELSE // ProcessorExtendedState := Processor Supplied Values CASE (i) OF 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] ESAC FI FI i := i + 1 ENDFOR
xrstors
void _xrstors (const void * mem_addr, unsigned __int64 rs_mask)

Synopsis

void _xrstors (const void * mem_addr, unsigned __int64 rs_mask)
#include "immintrin.h"
Instruction: xrstors MEMmxsave
CPUID Flags: XSAVE + XSS

Description

Perform a full or partial restore of the enabled processor states using the state information stored in memory at mem_addr. xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in rs_mask, XCR0, and mem_addr.HEADER.XSTATE_BV. mem_addr must be aligned on a 64-byte boundary.

Operation

st_mask := mem_addr.HEADER.XSTATE_BV[62:0] FOR i := 0 to 62 IF (rs_mask[i] AND XCR0[i]) IF st_mask[i] CASE (i) OF 0: ProcessorState[x87 FPU] := mem_addr.FPUSSESave_Area[FPU] 1: ProcessorState[SSE] := mem_addr.FPUSSESave_Area[SSE] DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] ESAC ELSE // ProcessorExtendedState := Processor Supplied Values CASE (i) OF 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] ESAC FI FI i := i + 1 ENDFOR
xrstors64
void _xrstors64 (const void * mem_addr, unsigned __int64 rs_mask)

Synopsis

void _xrstors64 (const void * mem_addr, unsigned __int64 rs_mask)
#include "immintrin.h"
Instruction: xrstors64 MEMmxsave
CPUID Flags: XSAVE + XSS

Description

Perform a full or partial restore of the enabled processor states using the state information stored in memory at mem_addr. xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in rs_mask, XCR0, and mem_addr.HEADER.XSTATE_BV. mem_addr must be aligned on a 64-byte boundary.

Operation

st_mask := mem_addr.HEADER.XSTATE_BV[62:0] FOR i := 0 to 62 IF (rs_mask[i] AND XCR0[i]) IF st_mask[i] CASE (i) OF 0: ProcessorState[x87 FPU] := mem_addr.FPUSSESave_Area[FPU] 1: ProcessorState[SSE] := mem_addr.FPUSSESave_Area[SSE] DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] ESAC ELSE // ProcessorExtendedState := Processor Supplied Values CASE (i) OF 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] ESAC FI FI i := i + 1 ENDFOR
xsave
void _xsave (void * mem_addr, unsigned __int64 save_mask)

Synopsis

void _xsave (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsave MEMmxsave
CPUID Flags: XSAVE

Description

Perform a full or partial save of the enabled processor states to memory at mem_addr. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary.

Operation

mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0] FOR i := 0 to 62 IF mask[i] CASE (i) OF 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU] 1: mem_addr.FPUSSESave_Area[SSE] := ProcessorState[SSE] DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] ESAC mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] FI i := i + 1 ENDFOR
xsave64
void _xsave64 (void * mem_addr, unsigned __int64 save_mask)

Synopsis

void _xsave64 (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsave64 MEMmxsave
CPUID Flags: XSAVE

Description

Perform a full or partial save of the enabled processor states to memory at mem_addr. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary.

Operation

mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0] FOR i := 0 to 62 IF mask[i] CASE (i) OF 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU] 1: mem_addr.FPUSSESave_Area[SSE] := ProcessorState[SSE] DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] ESAC mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] FI i := i + 1 ENDFOR
xsavec
void _xsavec (void * mem_addr, unsigned __int64 save_mask)

Synopsis

void _xsavec (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsavec MEMmxsave
CPUID Flags: XSAVE + XSAVEC

Description

Perform a full or partial save of the enabled processor states to memory at mem_addr; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary.

Operation

mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0] FOR i := 0 to 62 IF mask[i] CASE (i) OF 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU] 1: mem_addr.FPUSSESave_Area[SSE] := ProcessorState[SSE] DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] ESAC mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] FI i := i + 1 ENDFOR
xsavec64
void _xsavec64 (void * mem_addr, unsigned __int64 save_mask)

Synopsis

void _xsavec64 (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsavec64 mem
CPUID Flags: XSAVE + XSAVEC

Description

Perform a full or partial save of the enabled processor states to memory at mem_addr; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary.

Operation

mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0] FOR i := 0 to 62 IF mask[i] CASE (i) OF 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU] 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] ESAC mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] FI i := i + 1 ENDFOR
xsaveopt
void _xsaveopt (void * mem_addr, unsigned __int64 save_mask)

Synopsis

void _xsaveopt (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsaveopt mem
CPUID Flags: XSAVE + XSAVEOPT

Description

Perform a full or partial save of the enabled processor states to memory at mem_addr. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE instruction.

Operation

mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0] FOR i := 0 to 62 IF mask[i] CASE (i) OF 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU] 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] 2: mem_addr.EXT_SAVE_Area2[YMM] := ProcessorState[YMM] DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] ESAC mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] FI i := i + 1 ENDFOR
xsaveopt64
void _xsaveopt64 (void * mem_addr, unsigned __int64 save_mask)

Synopsis

void _xsaveopt64 (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsaveopt64 mem
CPUID Flags: XSAVE + XSAVEOPT

Description

Perform a full or partial save of the enabled processor states to memory at mem_addr. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE64 instruction.

Operation

mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0] FOR i := 0 to 62 IF mask[i] CASE (i) OF 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU] 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] 2: mem_addr.EXT_SAVE_Area2[YMM] := ProcessorState[YMM] DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] ESAC mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] FI i := i + 1 ENDFOR
xsaves
void _xsaves (void * mem_addr, unsigned __int64 save_mask)

Synopsis

void _xsaves (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsaves mem
CPUID Flags: XSAVE + XSS

Description

Perform a full or partial save of the enabled processor states to memory at mem_addr; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary.

Operation

mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0] FOR i := 0 to 62 IF mask[i] CASE (i) OF 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU] 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] ESAC mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] FI i := i + 1 ENDFOR
xsaves64
void _xsaves64 (void * mem_addr, unsigned __int64 save_mask)

Synopsis

void _xsaves64 (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsaves64 mem
CPUID Flags: XSAVE + XSS

Description

Perform a full or partial save of the enabled processor states to memory at mem_addr; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary.

Operation

mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0] FOR i := 0 to 62 IF mask[i] CASE (i) OF 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU] 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] ESAC mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] FI i := i + 1 ENDFOR
xsetbv
void _xsetbv (unsigned int a, unsigned __int64 val)

Synopsis

void _xsetbv (unsigned int a, unsigned __int64 val)
#include "immintrin.h"
Instruction: xsetbv
CPUID Flags: XSAVE

Description

Copy 64-bits from val to the extended control register (XCR) specified by a. Currently only XFEATURE_ENABLED_MASK XCR is supported.

Operation

XCR[a] := val[63:0]
xtest
unsigned char _xtest (void)

Synopsis

unsigned char _xtest (void)
#include "immintrin.h"
Instruction: xtest
CPUID Flags: RTM

Description

Query the transactional execution status, return 1 if inside a transactionally executing RTM or HLE region, and return 0 otherwise.

Operation

IF (RTM_ACTIVE = 1 OR HLE_ACTIVE = 1) dst := 1 ELSE dst := 0 FI
vzeroall
void _mm256_zeroall (void)

Synopsis

void _mm256_zeroall (void)
#include "immintrin.h"
Instruction: vzeroall
CPUID Flags: AVX

Description

Zero the contents of all XMM or YMM registers.

Operation

YMM0[MAX:0] := 0 YMM1[MAX:0] := 0 YMM2[MAX:0] := 0 YMM3[MAX:0] := 0 YMM4[MAX:0] := 0 YMM5[MAX:0] := 0 YMM6[MAX:0] := 0 YMM7[MAX:0] := 0 IF 64-bit mode YMM8[MAX:0] := 0 YMM9[MAX:0] := 0 YMM10[MAX:0] := 0 YMM11[MAX:0] := 0 YMM12[MAX:0] := 0 YMM13[MAX:0] := 0 YMM14[MAX:0] := 0 YMM15[MAX:0] := 0 FI
vzeroupper
void _mm256_zeroupper (void)

Synopsis

void _mm256_zeroupper (void)
#include "immintrin.h"
Instruction: vzeroupper
CPUID Flags: AVX

Description

Zero the upper 128 bits of all YMM registers; the lower 128-bits of the registers are unmodified.

Operation

YMM0[MAX:128] := 0 YMM1[MAX:128] := 0 YMM2[MAX:128] := 0 YMM3[MAX:128] := 0 YMM4[MAX:128] := 0 YMM5[MAX:128] := 0 YMM6[MAX:128] := 0 YMM7[MAX:128] := 0 IF 64-bit mode YMM8[MAX:128] := 0 YMM9[MAX:128] := 0 YMM10[MAX:128] := 0 YMM11[MAX:128] := 0 YMM12[MAX:128] := 0 YMM13[MAX:128] := 0 YMM14[MAX:128] := 0 YMM15[MAX:128] := 0 FI

Performance

ArchitectureLatencyThroughput
Haswell01
Ivy Bridge01
Sandy Bridge01